[llvm-branch-commits] [llvm] AMDGPU: Avoid default subtarget in generated codegen tests (2/9) (PR #205785)

Matt Arsenault via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Jun 25 05:01:15 PDT 2026


https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/205785

Continue migrating away from testing the dummy target, and use
real targets approximating the old behavior. Performed by script.

Co-Authored-By: Claude <noreply at anthropic.com> (Claude-Opus-4.8)

>From 1c5aea0c24502dd2c29cbc8d752ed92935d1ad54 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 25 Jun 2026 11:16:15 +0200
Subject: [PATCH] AMDGPU: Avoid default subtarget in generated codegen tests
 (2/9)

Continue migrating away from testing the dummy target, and use
real targets approximating the old behavior. Performed by script.

Co-Authored-By: Claude <noreply at anthropic.com> (Claude-Opus-4.8)
---
 llvm/test/CodeGen/AMDGPU/icmp.i16.ll          |  182 +-
 .../AMDGPU/illegal-sgpr-to-vgpr-copy.ll       |    4 +-
 .../CodeGen/AMDGPU/indirect-addressing-si.ll  | 1701 +++++-----------
 llvm/test/CodeGen/AMDGPU/indirect-call.ll     |   64 +-
 .../CodeGen/AMDGPU/invalid-addrspacecast.ll   |    6 +-
 llvm/test/CodeGen/AMDGPU/ipra-regmask.ll      |    8 +-
 llvm/test/CodeGen/AMDGPU/kernel-args.ll       |  141 +-
 .../test/CodeGen/AMDGPU/kill-infinite-loop.ll |   12 +-
 .../CodeGen/AMDGPU/livevars-implicitdef.mir   |   12 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll   |   13 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll |  244 +--
 .../CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll    |   97 +-
 .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll       |   26 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll   |   24 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll   |   32 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll   |   21 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll   |    4 +-
 .../AMDGPU/llvm.amdgcn.sched.barrier.ll       |    4 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll  |   63 +-
 llvm/test/CodeGen/AMDGPU/llvm.mulo.ll         |  212 +-
 .../AMDGPU/llvm.r600.read.local.size.ll       |   20 +-
 llvm/test/CodeGen/AMDGPU/load-constant-f32.ll |    6 +-
 llvm/test/CodeGen/AMDGPU/load-constant-f64.ll |   18 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i1.ll  | 1284 ++++++------
 llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 1285 ++++++------
 llvm/test/CodeGen/AMDGPU/load-constant-i32.ll |  588 +++---
 llvm/test/CodeGen/AMDGPU/load-constant-i64.ll |   38 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i8.ll  | 1725 +++++++++--------
 llvm/test/CodeGen/AMDGPU/load-global-f32.ll   |   12 +-
 llvm/test/CodeGen/AMDGPU/load-global-i16.ll   | 1219 ++++++------
 llvm/test/CodeGen/AMDGPU/load-global-i32.ll   |  463 +++--
 llvm/test/CodeGen/AMDGPU/load-global-i8.ll    | 1454 +++++++-------
 llvm/test/CodeGen/AMDGPU/load-local-i16.ll    | 1251 ++++++------
 .../AMDGPU/long-branch-reserve-register.ll    |    3 +-
 llvm/test/CodeGen/AMDGPU/loop_break.ll        |   34 +-
 ...wer-control-flow-live-variables-update.mir |    2 +-
 .../AMDGPU/machinelicm-copy-like-instrs.mir   |    4 +-
 llvm/test/CodeGen/AMDGPU/mad_uint24.ll        |  100 +-
 .../CodeGen/AMDGPU/mcp-use-before-def.mir     |    2 +-
 .../memory-legalizer-atomic-insert-end.mir    |    2 +-
 .../memory-legalizer-store-infinite-loop.ll   |   14 +-
 llvm/test/CodeGen/AMDGPU/mul_int24.ll         |   61 +-
 llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll |   74 +-
 llvm/test/CodeGen/AMDGPU/multilevel-break.ll  |   12 +-
 .../test/CodeGen/AMDGPU/no-limit-coalesce.mir |    2 +-
 .../CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir  |    2 +-
 .../AMDGPU/optimize-exec-masking-pre-ra.mir   |    2 +-
 .../CodeGen/AMDGPU/optimize-negated-cond.ll   |   30 +-
 .../AMDGPU/peephole-opt-regseq-removal.mir    |    2 +-
 .../AMDGPU/pei-reg-scavenger-position.mir     |    2 +-
 llvm/test/CodeGen/AMDGPU/perfhint.ll          |    2 +-
 llvm/test/CodeGen/AMDGPU/rcp_iflag.ll         |   26 +-
 .../regcoalesce-cannot-join-failures.mir      |    2 +-
 ...keep-valid-lanes-implicit-def-bug39602.mir |    2 +-
 ...cer-resolve-lane-conflict-by-subranges.mir |    2 +-
 .../test/CodeGen/AMDGPU/remat-dead-subreg.mir |    2 +-
 .../AMDGPU/rewrite-partial-reg-uses-dbg.mir   |    4 +-
 .../AMDGPU/rewrite-partial-reg-uses-gen.mir   |    4 +-
 .../AMDGPU/rewrite-partial-reg-uses.mir       |    4 +-
 llvm/test/CodeGen/AMDGPU/rotate-add.ll        |   30 +-
 llvm/test/CodeGen/AMDGPU/rotl.ll              |   64 +-
 llvm/test/CodeGen/AMDGPU/rotr.ll              |   90 +-
 llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll  |   10 +-
 .../CodeGen/AMDGPU/select-constant-cttz.ll    |   20 +-
 .../CodeGen/AMDGPU/setcc-select-hi32mask.ll   |    2 +-
 llvm/test/CodeGen/AMDGPU/setcc-select.ll      |    2 +-
 .../CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir     |    2 +-
 .../CodeGen/AMDGPU/shrink-add-sub-constant.ll |  134 +-
 .../si-annotate-nested-control-flows.ll       |   16 +-
 llvm/test/CodeGen/AMDGPU/si-i1-copies.mir     |    2 +-
 .../CodeGen/AMDGPU/si-lower-control-flow.mir  |    2 +-
 .../CodeGen/AMDGPU/si-lower-i1-copies.mir     |    4 +-
 .../si-lower-sgpr-spills-cycle-header.mir     |    4 +-
 ...wer-sgpr-spills-initial-insert-in-body.mir |    2 +-
 ...er-sgpr-spills-initial-insert-in-latch.mir |    2 +-
 ...si-lower-sgpr-spills-multi-entry-cycle.mir |    4 +-
 .../AMDGPU/simplifydemandedbits-recursion.ll  |   34 +-
 llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll    |  252 ++-
 .../AMDGPU/skip-branch-taildup-ret.mir        |    2 +-
 .../AMDGPU/spill-empty-live-interval.mir      |    2 +-
 .../spill-partial-csr-sgpr-live-ins.mir       |    2 +-
 .../AMDGPU/spill-sgpr-csr-live-ins.mir        |    2 +-
 .../AMDGPU/splitkit-copy-live-lanes.mir       |    2 +-
 .../AMDGPU/splitkit-nolivesubranges.mir       |    2 +-
 .../AMDGPU/srem-seteq-illegal-types.ll        |   54 +-
 .../stop-tail-duplicate-cfg-intrinsic.mir     |    4 +-
 llvm/test/CodeGen/AMDGPU/tail-dup-bundle.mir  |    2 +-
 .../CodeGen/AMDGPU/trunc-bitcast-vector.ll    |    2 +-
 .../test/CodeGen/AMDGPU/trunc-cmp-constant.ll |    2 +-
 .../CodeGen/AMDGPU/twoaddr-regsequence.mir    |    2 +-
 llvm/test/CodeGen/AMDGPU/udivrem24.ll         |  532 ++---
 llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll    |  140 +-
 .../AMDGPU/undefined-subreg-liverange.ll      |    6 +-
 .../AMDGPU/unstructured-cfg-def-use-issue.ll  |   49 +-
 .../AMDGPU/urem-seteq-illegal-types.ll        |   32 +-
 llvm/test/CodeGen/AMDGPU/v_cndmask.ll         |  270 +--
 96 files changed, 6864 insertions(+), 7548 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
index fc4cdcda99ae4..05445f8311ec8 100644
--- a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn < %s| FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s| FileCheck -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s| FileCheck -check-prefix=GFX11-FAKE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s| FileCheck -check-prefix=GFX11-TRUE16 %s
 
@@ -35,20 +35,20 @@ define amdgpu_kernel void @i16_eq(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr
 ; SI-LABEL: i16_eq:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[8:11], 0 addr64
 ; SI-NEXT:    buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
@@ -132,20 +132,20 @@ define amdgpu_kernel void @i16_ne(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr
 ; SI-LABEL: i16_ne:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[8:11], 0 addr64
 ; SI-NEXT:    buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v3, v4
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v3
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
@@ -229,20 +229,20 @@ define amdgpu_kernel void @i16_ugt(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; SI-LABEL: i16_ugt:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[8:11], 0 addr64
 ; SI-NEXT:    buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_gt_u32_e32 vcc, v3, v4
+; SI-NEXT:    v_cmp_gt_u32_e32 vcc, v4, v3
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
@@ -326,20 +326,20 @@ define amdgpu_kernel void @i16_uge(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; SI-LABEL: i16_uge:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[8:11], 0 addr64
 ; SI-NEXT:    buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
+; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v3
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
@@ -423,20 +423,20 @@ define amdgpu_kernel void @i16_ult(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; SI-LABEL: i16_ult:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[8:11], 0 addr64
 ; SI-NEXT:    buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_lt_u32_e32 vcc, v3, v4
+; SI-NEXT:    v_cmp_lt_u32_e32 vcc, v4, v3
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
@@ -520,20 +520,20 @@ define amdgpu_kernel void @i16_ule(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; SI-LABEL: i16_ule:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[8:11], 0 addr64
 ; SI-NEXT:    buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_le_u32_e32 vcc, v3, v4
+; SI-NEXT:    v_cmp_le_u32_e32 vcc, v4, v3
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
@@ -618,20 +618,20 @@ define amdgpu_kernel void @i16_sgt(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; SI-LABEL: i16_sgt:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    buffer_load_sshort v3, v[1:2], s[8:11], 0 addr64
 ; SI-NEXT:    buffer_load_sshort v4, v[1:2], s[4:7], 0 addr64
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_gt_i32_e32 vcc, v3, v4
+; SI-NEXT:    v_cmp_gt_i32_e32 vcc, v4, v3
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
@@ -715,20 +715,20 @@ define amdgpu_kernel void @i16_sge(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; SI-LABEL: i16_sge:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    buffer_load_sshort v3, v[1:2], s[8:11], 0 addr64
 ; SI-NEXT:    buffer_load_sshort v4, v[1:2], s[4:7], 0 addr64
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_ge_i32_e32 vcc, v3, v4
+; SI-NEXT:    v_cmp_ge_i32_e32 vcc, v4, v3
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
@@ -812,20 +812,20 @@ define amdgpu_kernel void @i16_slt(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; SI-LABEL: i16_slt:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    buffer_load_sshort v3, v[1:2], s[8:11], 0 addr64
 ; SI-NEXT:    buffer_load_sshort v4, v[1:2], s[4:7], 0 addr64
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, v3, v4
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, v4, v3
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
@@ -909,20 +909,20 @@ define amdgpu_kernel void @i16_sle(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; SI-LABEL: i16_sle:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    buffer_load_sshort v3, v[1:2], s[8:11], 0 addr64
 ; SI-NEXT:    buffer_load_sshort v4, v[1:2], s[4:7], 0 addr64
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_le_i32_e32 vcc, v3, v4
+; SI-NEXT:    v_cmp_le_i32_e32 vcc, v4, v3
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
@@ -1007,12 +1007,12 @@ define amdgpu_kernel void @i16_eq_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_and_b32 s4, s8, 0xffff
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v3
@@ -1091,12 +1091,12 @@ define amdgpu_kernel void @i16_ne_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_and_b32 s4, s8, 0xffff
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v3
@@ -1175,12 +1175,12 @@ define amdgpu_kernel void @i16_ugt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_and_b32 s4, s8, 0xffff
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v3
@@ -1259,12 +1259,12 @@ define amdgpu_kernel void @i16_uge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_and_b32 s4, s8, 0xffff
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
@@ -1343,12 +1343,12 @@ define amdgpu_kernel void @i16_ult_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_and_b32 s4, s8, 0xffff
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v3
@@ -1427,12 +1427,12 @@ define amdgpu_kernel void @i16_ule_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_and_b32 s4, s8, 0xffff
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_ge_u32_e32 vcc, s4, v3
@@ -1511,12 +1511,12 @@ define amdgpu_kernel void @i16_sgt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    buffer_load_sshort v3, v[1:2], s[4:7], 0 addr64
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_sext_i32_i16 s4, s8
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v3
@@ -1595,12 +1595,12 @@ define amdgpu_kernel void @i16_sge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    buffer_load_sshort v3, v[1:2], s[4:7], 0 addr64
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_sext_i32_i16 s4, s8
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_le_i32_e32 vcc, s4, v3
@@ -1679,12 +1679,12 @@ define amdgpu_kernel void @i16_slt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    buffer_load_sshort v3, v[1:2], s[4:7], 0 addr64
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_sext_i32_i16 s4, s8
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v3
@@ -1763,12 +1763,12 @@ define amdgpu_kernel void @i16_sle_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    buffer_load_sshort v3, v[1:2], s[4:7], 0 addr64
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_sext_i32_i16 s4, s8
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_ge_i32_e32 vcc, s4, v3
diff --git a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
index 0b4e1c2359a7b..3e4cd5bf5f44d 100644
--- a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn < %s 2>&1 | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s 2>&1 | FileCheck -check-prefix=GCN %s
 
-; It is not illegal anymore because the SGPRs with divergent values are added with readfirstlane 
+; It is not illegal anymore because the SGPRs with divergent values are added with readfirstlane
 ; instruction in inline asm.
 define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_i32() #0 {
 ; GCN-LABEL: illegal_vgpr_to_sgpr_copy_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 1961ba53c1661..5d2259216e7f7 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GENERIC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GENERIC %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -O0 < %s | FileCheck -check-prefix=NOOPT %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI-MOVREL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI,VI-MOVREL %s
@@ -11,70 +11,29 @@
 define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) {
 ; GENERIC-LABEL: extract_w_offset:
 ; GENERIC:       ; %bb.0: ; %entry
+; GENERIC-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GENERIC-NEXT:    s_load_dword s4, s[4:5], 0xb
+; GENERIC-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
 ; GENERIC-NEXT:    s_mov_b32 s2, -1
-; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x40400000
-; GENERIC-NEXT:    v_mov_b32_e32 v1, 0x40a00000
-; GENERIC-NEXT:    v_mov_b32_e32 v2, 0x40c00000
-; GENERIC-NEXT:    v_mov_b32_e32 v3, 0x40e00000
-; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x41000000
-; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x41100000
-; GENERIC-NEXT:    v_mov_b32_e32 v6, 0x41200000
-; GENERIC-NEXT:    v_mov_b32_e32 v7, 0x41300000
-; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x41400000
-; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x41500000
-; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41600000
-; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x41700000
-; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41800000
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_add_i32 s6, s4, 1
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 1
-; GENERIC-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e64 v13, 1.0, 2.0, s[4:5]
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 2
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 3
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, 4.0, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 4
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 5
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 6
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 7
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 8
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 9
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 10
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 11
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 12
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 13
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 14
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 15
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GENERIC-NEXT:    s_add_i32 m0, s6, 1
+; GENERIC-NEXT:    v_mov_b32_e32 v1, 2.0
+; GENERIC-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GENERIC-NEXT:    v_mov_b32_e32 v3, 4.0
+; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x40a00000
+; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x40c00000
+; GENERIC-NEXT:    v_mov_b32_e32 v6, 0x40e00000
+; GENERIC-NEXT:    v_mov_b32_e32 v7, 0x41000000
+; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x41100000
+; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x41200000
+; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41300000
+; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x41400000
+; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41500000
+; GENERIC-NEXT:    v_mov_b32_e32 v13, 0x41600000
+; GENERIC-NEXT:    v_mov_b32_e32 v14, 0x41700000
+; GENERIC-NEXT:    v_mov_b32_e32 v15, 0x41800000
+; GENERIC-NEXT:    v_movrels_b32_e32 v0, v0
 ; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GENERIC-NEXT:    s_endpgm
 ;
@@ -275,12 +234,14 @@ entry:
 define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %out, i32 %in, <16 x i32> %or.val) {
 ; GENERIC-LABEL: extract_w_offset_salu_use_vector:
 ; GENERIC:       ; %bb.0: ; %entry
-; GENERIC-NEXT:    s_load_dword s2, s[4:5], 0xb
+; GENERIC-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; GENERIC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x19
 ; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
+; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_add_i32 s2, s2, 1
+; GENERIC-NEXT:    s_add_i32 m0, s6, 1
+; GENERIC-NEXT:    s_or_b32 s8, s8, 1
 ; GENERIC-NEXT:    s_or_b32 s4, s23, 16
 ; GENERIC-NEXT:    s_or_b32 s5, s22, 15
 ; GENERIC-NEXT:    s_or_b32 s6, s21, 14
@@ -295,40 +256,24 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou
 ; GENERIC-NEXT:    s_or_b32 s12, s12, 5
 ; GENERIC-NEXT:    s_or_b32 s11, s11, 4
 ; GENERIC-NEXT:    s_or_b32 s10, s10, 3
-; GENERIC-NEXT:    s_or_b32 s8, s8, 1
 ; GENERIC-NEXT:    s_or_b32 s9, s9, 2
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 1
-; GENERIC-NEXT:    s_cselect_b32 s8, s9, s8
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 2
-; GENERIC-NEXT:    s_cselect_b32 s8, s10, s8
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 3
-; GENERIC-NEXT:    s_cselect_b32 s8, s11, s8
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 4
-; GENERIC-NEXT:    s_cselect_b32 s8, s12, s8
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 5
-; GENERIC-NEXT:    s_cselect_b32 s8, s13, s8
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 6
-; GENERIC-NEXT:    s_cselect_b32 s8, s14, s8
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 7
-; GENERIC-NEXT:    s_cselect_b32 s8, s15, s8
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 8
-; GENERIC-NEXT:    s_cselect_b32 s8, s16, s8
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 9
-; GENERIC-NEXT:    s_cselect_b32 s8, s17, s8
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 10
-; GENERIC-NEXT:    s_cselect_b32 s8, s18, s8
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 11
-; GENERIC-NEXT:    s_cselect_b32 s8, s19, s8
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 12
-; GENERIC-NEXT:    s_cselect_b32 s7, s7, s8
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 13
-; GENERIC-NEXT:    s_cselect_b32 s6, s6, s7
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 14
-; GENERIC-NEXT:    s_cselect_b32 s5, s5, s6
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 15
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, s5
-; GENERIC-NEXT:    s_mov_b32 s2, -1
-; GENERIC-NEXT:    v_mov_b32_e32 v0, s4
+; GENERIC-NEXT:    v_mov_b32_e32 v0, s8
+; GENERIC-NEXT:    v_mov_b32_e32 v1, s9
+; GENERIC-NEXT:    v_mov_b32_e32 v2, s10
+; GENERIC-NEXT:    v_mov_b32_e32 v3, s11
+; GENERIC-NEXT:    v_mov_b32_e32 v4, s12
+; GENERIC-NEXT:    v_mov_b32_e32 v5, s13
+; GENERIC-NEXT:    v_mov_b32_e32 v6, s14
+; GENERIC-NEXT:    v_mov_b32_e32 v7, s15
+; GENERIC-NEXT:    v_mov_b32_e32 v8, s16
+; GENERIC-NEXT:    v_mov_b32_e32 v9, s17
+; GENERIC-NEXT:    v_mov_b32_e32 v10, s18
+; GENERIC-NEXT:    v_mov_b32_e32 v11, s19
+; GENERIC-NEXT:    v_mov_b32_e32 v12, s7
+; GENERIC-NEXT:    v_mov_b32_e32 v13, s6
+; GENERIC-NEXT:    v_mov_b32_e32 v14, s5
+; GENERIC-NEXT:    v_mov_b32_e32 v15, s4
+; GENERIC-NEXT:    v_movrels_b32_e32 v0, v0
 ; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GENERIC-NEXT:    s_endpgm
 ;
@@ -632,68 +577,28 @@ define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) {
 ; GENERIC-LABEL: extract_wo_offset:
 ; GENERIC:       ; %bb.0: ; %entry
 ; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GENERIC-NEXT:    s_load_dword s6, s[4:5], 0xb
+; GENERIC-NEXT:    s_load_dword s4, s[4:5], 0xb
+; GENERIC-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
 ; GENERIC-NEXT:    s_mov_b32 s2, -1
-; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x40400000
-; GENERIC-NEXT:    v_mov_b32_e32 v1, 0x40a00000
-; GENERIC-NEXT:    v_mov_b32_e32 v2, 0x40c00000
-; GENERIC-NEXT:    v_mov_b32_e32 v3, 0x40e00000
-; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x41000000
-; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x41100000
-; GENERIC-NEXT:    v_mov_b32_e32 v6, 0x41200000
-; GENERIC-NEXT:    v_mov_b32_e32 v7, 0x41300000
-; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x41400000
-; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x41500000
-; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41600000
-; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x41700000
-; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41800000
+; GENERIC-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 1
-; GENERIC-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e64 v13, 1.0, 2.0, s[4:5]
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 2
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 3
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, 4.0, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 4
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 5
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 6
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 7
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 8
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 9
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 10
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 11
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 12
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 13
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 14
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s6, 15
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GENERIC-NEXT:    s_mov_b32 m0, s4
+; GENERIC-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GENERIC-NEXT:    v_mov_b32_e32 v3, 4.0
+; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x40a00000
+; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x40c00000
+; GENERIC-NEXT:    v_mov_b32_e32 v6, 0x40e00000
+; GENERIC-NEXT:    v_mov_b32_e32 v7, 0x41000000
+; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x41100000
+; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x41200000
+; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41300000
+; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x41400000
+; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41500000
+; GENERIC-NEXT:    v_mov_b32_e32 v13, 0x41600000
+; GENERIC-NEXT:    v_mov_b32_e32 v14, 0x41700000
+; GENERIC-NEXT:    v_mov_b32_e32 v15, 0x41800000
+; GENERIC-NEXT:    v_movrels_b32_e32 v0, v0
 ; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GENERIC-NEXT:    s_endpgm
 ;
@@ -888,43 +793,29 @@ entry:
 define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %offset) {
 ; GENERIC-LABEL: extract_neg_offset_sgpr:
 ; GENERIC:       ; %bb.0: ; %entry
-; GENERIC-NEXT:    s_load_dword s2, s[4:5], 0xb
 ; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GENERIC-NEXT:    s_load_dword s4, s[4:5], 0xb
+; GENERIC-NEXT:    v_mov_b32_e32 v0, 0
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
-; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_addk_i32 s2, 0xfe00
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 1
-; GENERIC-NEXT:    s_cselect_b32 s4, 1, 0
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 2
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 2
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 3
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 3
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 4
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 5
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 6
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 6
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 7
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 7
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 8
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 8
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 9
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 9
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 10
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 10
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 11
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 11
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 12
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 12
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 13
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 13
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 14
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 14
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 15
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 15
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 16
 ; GENERIC-NEXT:    s_mov_b32 s2, -1
-; GENERIC-NEXT:    v_mov_b32_e32 v0, s4
+; GENERIC-NEXT:    v_mov_b32_e32 v1, 1
+; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT:    s_add_i32 m0, s4, 0xfffffe00
+; GENERIC-NEXT:    v_mov_b32_e32 v2, 2
+; GENERIC-NEXT:    v_mov_b32_e32 v3, 3
+; GENERIC-NEXT:    v_mov_b32_e32 v4, 5
+; GENERIC-NEXT:    v_mov_b32_e32 v5, 6
+; GENERIC-NEXT:    v_mov_b32_e32 v6, 7
+; GENERIC-NEXT:    v_mov_b32_e32 v7, 8
+; GENERIC-NEXT:    v_mov_b32_e32 v8, 9
+; GENERIC-NEXT:    v_mov_b32_e32 v9, 10
+; GENERIC-NEXT:    v_mov_b32_e32 v10, 11
+; GENERIC-NEXT:    v_mov_b32_e32 v11, 12
+; GENERIC-NEXT:    v_mov_b32_e32 v12, 13
+; GENERIC-NEXT:    v_mov_b32_e32 v13, 14
+; GENERIC-NEXT:    v_mov_b32_e32 v14, 15
+; GENERIC-NEXT:    v_mov_b32_e32 v15, 16
+; GENERIC-NEXT:    v_movrels_b32_e32 v0, v0
 ; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GENERIC-NEXT:    s_endpgm
 ;
@@ -1123,8 +1014,12 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out,
 ; GENERIC:       ; %bb.0: ; %entry
 ; GENERIC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x19
 ; GENERIC-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x29
-; GENERIC-NEXT:    s_load_dword s2, s[4:5], 0x39
+; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GENERIC-NEXT:    s_load_dword s4, s[4:5], 0x39
+; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
+; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT:    s_or_b32 s8, s8, s36
 ; GENERIC-NEXT:    s_or_b32 s6, s23, s51
 ; GENERIC-NEXT:    s_or_b32 s7, s22, s50
 ; GENERIC-NEXT:    s_or_b32 s21, s21, s49
@@ -1139,44 +1034,25 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out,
 ; GENERIC-NEXT:    s_or_b32 s12, s12, s40
 ; GENERIC-NEXT:    s_or_b32 s11, s11, s39
 ; GENERIC-NEXT:    s_or_b32 s10, s10, s38
-; GENERIC-NEXT:    s_or_b32 s8, s8, s36
 ; GENERIC-NEXT:    s_or_b32 s9, s9, s37
-; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
-; GENERIC-NEXT:    s_addk_i32 s2, 0xfe00
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 1
-; GENERIC-NEXT:    s_cselect_b32 s4, s9, s8
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 2
-; GENERIC-NEXT:    s_cselect_b32 s4, s10, s4
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 3
-; GENERIC-NEXT:    s_cselect_b32 s4, s11, s4
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 4
-; GENERIC-NEXT:    s_cselect_b32 s4, s12, s4
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 5
-; GENERIC-NEXT:    s_cselect_b32 s4, s13, s4
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 6
-; GENERIC-NEXT:    s_cselect_b32 s4, s14, s4
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 7
-; GENERIC-NEXT:    s_cselect_b32 s4, s15, s4
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 8
-; GENERIC-NEXT:    s_cselect_b32 s4, s16, s4
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 9
-; GENERIC-NEXT:    s_cselect_b32 s4, s17, s4
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 10
-; GENERIC-NEXT:    s_cselect_b32 s4, s18, s4
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 11
-; GENERIC-NEXT:    s_cselect_b32 s4, s19, s4
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 12
-; GENERIC-NEXT:    s_cselect_b32 s4, s20, s4
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 13
-; GENERIC-NEXT:    s_cselect_b32 s4, s21, s4
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 14
-; GENERIC-NEXT:    s_cselect_b32 s4, s7, s4
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 15
-; GENERIC-NEXT:    s_cselect_b32 s4, s6, s4
-; GENERIC-NEXT:    s_mov_b32 s2, -1
-; GENERIC-NEXT:    v_mov_b32_e32 v0, s4
-; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT:    v_mov_b32_e32 v0, s8
+; GENERIC-NEXT:    s_add_i32 m0, s4, 0xfffffe00
+; GENERIC-NEXT:    v_mov_b32_e32 v1, s9
+; GENERIC-NEXT:    v_mov_b32_e32 v2, s10
+; GENERIC-NEXT:    v_mov_b32_e32 v3, s11
+; GENERIC-NEXT:    v_mov_b32_e32 v4, s12
+; GENERIC-NEXT:    v_mov_b32_e32 v5, s13
+; GENERIC-NEXT:    v_mov_b32_e32 v6, s14
+; GENERIC-NEXT:    v_mov_b32_e32 v7, s15
+; GENERIC-NEXT:    v_mov_b32_e32 v8, s16
+; GENERIC-NEXT:    v_mov_b32_e32 v9, s17
+; GENERIC-NEXT:    v_mov_b32_e32 v10, s18
+; GENERIC-NEXT:    v_mov_b32_e32 v11, s19
+; GENERIC-NEXT:    v_mov_b32_e32 v12, s20
+; GENERIC-NEXT:    v_mov_b32_e32 v13, s21
+; GENERIC-NEXT:    v_mov_b32_e32 v14, s7
+; GENERIC-NEXT:    v_mov_b32_e32 v15, s6
+; GENERIC-NEXT:    v_movrels_b32_e32 v0, v0
 ; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GENERIC-NEXT:    s_endpgm
 ;
@@ -1483,9 +1359,6 @@ entry:
 define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
 ; GENERIC-LABEL: extract_neg_offset_vgpr:
 ; GENERIC:       ; %bb.0: ; %entry
-; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
-; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    v_add_i32_e32 v0, vcc, 0xfffffe00, v0
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GENERIC-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1508,6 +1381,7 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
 ; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v0
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 11, v1, vcc
 ; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 11, v0
+; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 12, v1, vcc
 ; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v0
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 13, v1, vcc
@@ -1516,6 +1390,8 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
 ; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v0
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 15, v1, vcc
 ; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 15, v0
+; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
+; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v0, 16, v1, vcc
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
 ; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -1944,79 +1820,33 @@ entry:
 define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) {
 ; GENERIC-LABEL: insert_w_offset:
 ; GENERIC:       ; %bb.0: ; %entry
+; GENERIC-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GENERIC-NEXT:    s_load_dword s4, s[4:5], 0xb
-; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
-; GENERIC-NEXT:    s_mov_b32 s2, -1
-; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41880000
-; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x40400000
-; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x41000000
-; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x40e00000
-; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x40c00000
-; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x40a00000
+; GENERIC-NEXT:    v_mov_b32_e32 v0, 1.0
+; GENERIC-NEXT:    v_mov_b32_e32 v1, 2.0
+; GENERIC-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT:    s_add_i32 m0, s6, 1
+; GENERIC-NEXT:    v_mov_b32_e32 v3, 4.0
+; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x40a00000
+; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x40c00000
+; GENERIC-NEXT:    v_mov_b32_e32 v6, 0x40e00000
+; GENERIC-NEXT:    v_mov_b32_e32 v7, 0x41000000
+; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x41100000
+; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x41200000
+; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41300000
 ; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x41400000
-; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41300000
-; GENERIC-NEXT:    v_mov_b32_e32 v13, 0x41200000
-; GENERIC-NEXT:    v_mov_b32_e32 v14, 0x41100000
+; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41500000
+; GENERIC-NEXT:    v_mov_b32_e32 v13, 0x41600000
+; GENERIC-NEXT:    v_mov_b32_e32 v14, 0x41700000
 ; GENERIC-NEXT:    v_mov_b32_e32 v15, 0x41800000
-; GENERIC-NEXT:    v_mov_b32_e32 v16, 0x41700000
-; GENERIC-NEXT:    v_mov_b32_e32 v17, 0x41600000
-; GENERIC-NEXT:    v_mov_b32_e32 v18, 0x41500000
-; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_add_i32 s4, s4, 1
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 3
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 4.0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 2
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v2, v0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 1
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 2.0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 0
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, 1.0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 7
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v4, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 6
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v5, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 5
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 4
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, v9, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 11
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v11, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 10
+; GENERIC-NEXT:    v_mov_b32_e32 v16, 0x41880000
+; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
+; GENERIC-NEXT:    s_mov_b32 s2, -1
+; GENERIC-NEXT:    v_movreld_b32_e32 v0, v16
+; GENERIC-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GENERIC-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v12, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 9
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v13, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 8
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v14, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 15
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v9, v15, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 14
-; GENERIC-NEXT:    buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v16, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 13
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v17, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 12
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v18, v10, vcc
-; GENERIC-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GENERIC-NEXT:    s_endpgm
 ;
@@ -2308,80 +2138,33 @@ entry:
 define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %out, i16 %in) {
 ; GENERIC-LABEL: insert_unsigned_base_plus_offset:
 ; GENERIC:       ; %bb.0: ; %entry
+; GENERIC-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GENERIC-NEXT:    s_load_dword s4, s[4:5], 0xb
-; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
-; GENERIC-NEXT:    s_mov_b32 s2, -1
-; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41880000
-; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x40400000
-; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x41000000
-; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x40e00000
-; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x40c00000
-; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x40a00000
+; GENERIC-NEXT:    v_mov_b32_e32 v0, 1.0
+; GENERIC-NEXT:    v_mov_b32_e32 v1, 2.0
+; GENERIC-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT:    s_and_b32 m0, s6, 0xffff
+; GENERIC-NEXT:    v_mov_b32_e32 v3, 4.0
+; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x40a00000
+; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x40c00000
+; GENERIC-NEXT:    v_mov_b32_e32 v6, 0x40e00000
+; GENERIC-NEXT:    v_mov_b32_e32 v7, 0x41000000
+; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x41100000
+; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x41200000
+; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41300000
 ; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x41400000
-; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41300000
-; GENERIC-NEXT:    v_mov_b32_e32 v13, 0x41200000
-; GENERIC-NEXT:    v_mov_b32_e32 v14, 0x41100000
+; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41500000
+; GENERIC-NEXT:    v_mov_b32_e32 v13, 0x41600000
+; GENERIC-NEXT:    v_mov_b32_e32 v14, 0x41700000
 ; GENERIC-NEXT:    v_mov_b32_e32 v15, 0x41800000
-; GENERIC-NEXT:    v_mov_b32_e32 v16, 0x41700000
-; GENERIC-NEXT:    v_mov_b32_e32 v17, 0x41600000
-; GENERIC-NEXT:    v_mov_b32_e32 v18, 0x41500000
-; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_and_b32 s4, s4, 0xffff
-; GENERIC-NEXT:    s_add_i32 s4, s4, 1
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 3
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 4.0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 2
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v2, v0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 1
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 2.0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 0
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, 1.0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 7
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v4, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 6
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v5, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 5
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 4
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, v9, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 11
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v11, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 10
+; GENERIC-NEXT:    v_mov_b32_e32 v16, 0x41880000
+; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
+; GENERIC-NEXT:    s_mov_b32 s2, -1
+; GENERIC-NEXT:    v_movreld_b32_e32 v1, v16
+; GENERIC-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GENERIC-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v12, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 9
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v13, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 8
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v14, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 15
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v9, v15, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 14
-; GENERIC-NEXT:    buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v16, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 13
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v17, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 12
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v18, v10, vcc
-; GENERIC-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GENERIC-NEXT:    s_endpgm
 ;
@@ -2681,80 +2464,34 @@ entry:
 define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, i16 %in) {
 ; GENERIC-LABEL: insert_signed_base_plus_offset:
 ; GENERIC:       ; %bb.0: ; %entry
+; GENERIC-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GENERIC-NEXT:    s_load_dword s4, s[4:5], 0xb
-; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
-; GENERIC-NEXT:    s_mov_b32 s2, -1
-; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41880000
-; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x40400000
-; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x41000000
-; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x40e00000
-; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x40c00000
-; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x40a00000
+; GENERIC-NEXT:    v_mov_b32_e32 v0, 1.0
+; GENERIC-NEXT:    v_mov_b32_e32 v1, 2.0
+; GENERIC-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT:    s_sext_i32_i16 s4, s6
+; GENERIC-NEXT:    s_add_i32 m0, s4, 1
+; GENERIC-NEXT:    v_mov_b32_e32 v3, 4.0
+; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x40a00000
+; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x40c00000
+; GENERIC-NEXT:    v_mov_b32_e32 v6, 0x40e00000
+; GENERIC-NEXT:    v_mov_b32_e32 v7, 0x41000000
+; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x41100000
+; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x41200000
+; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41300000
 ; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x41400000
-; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41300000
-; GENERIC-NEXT:    v_mov_b32_e32 v13, 0x41200000
-; GENERIC-NEXT:    v_mov_b32_e32 v14, 0x41100000
+; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41500000
+; GENERIC-NEXT:    v_mov_b32_e32 v13, 0x41600000
+; GENERIC-NEXT:    v_mov_b32_e32 v14, 0x41700000
 ; GENERIC-NEXT:    v_mov_b32_e32 v15, 0x41800000
-; GENERIC-NEXT:    v_mov_b32_e32 v16, 0x41700000
-; GENERIC-NEXT:    v_mov_b32_e32 v17, 0x41600000
-; GENERIC-NEXT:    v_mov_b32_e32 v18, 0x41500000
-; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_sext_i32_i16 s4, s4
-; GENERIC-NEXT:    s_add_i32 s4, s4, 1
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 3
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 4.0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 2
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v2, v0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 1
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 2.0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 0
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, 1.0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 7
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v4, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 6
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v5, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 5
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 4
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, v9, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 11
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v11, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 10
+; GENERIC-NEXT:    v_mov_b32_e32 v16, 0x41880000
+; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
+; GENERIC-NEXT:    s_mov_b32 s2, -1
+; GENERIC-NEXT:    v_movreld_b32_e32 v0, v16
+; GENERIC-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GENERIC-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v12, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 9
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v13, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 8
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v14, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 15
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v9, v15, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 14
-; GENERIC-NEXT:    buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v16, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 13
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v17, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 12
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v18, v10, vcc
-; GENERIC-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GENERIC-NEXT:    s_endpgm
 ;
@@ -3063,76 +2800,31 @@ define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) {
 ; GENERIC:       ; %bb.0: ; %entry
 ; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; GENERIC-NEXT:    s_load_dword s4, s[4:5], 0xb
-; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
-; GENERIC-NEXT:    s_mov_b32 s2, -1
-; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41880000
-; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x40400000
-; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x41000000
-; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x40e00000
-; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x40c00000
-; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x40a00000
+; GENERIC-NEXT:    v_mov_b32_e32 v0, 1.0
+; GENERIC-NEXT:    v_mov_b32_e32 v1, 2.0
+; GENERIC-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GENERIC-NEXT:    v_mov_b32_e32 v3, 4.0
+; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x40a00000
+; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x40c00000
+; GENERIC-NEXT:    v_mov_b32_e32 v6, 0x40e00000
+; GENERIC-NEXT:    v_mov_b32_e32 v7, 0x41000000
+; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x41100000
+; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x41200000
+; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41300000
 ; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x41400000
-; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41300000
-; GENERIC-NEXT:    v_mov_b32_e32 v13, 0x41200000
-; GENERIC-NEXT:    v_mov_b32_e32 v14, 0x41100000
+; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41500000
+; GENERIC-NEXT:    v_mov_b32_e32 v13, 0x41600000
+; GENERIC-NEXT:    v_mov_b32_e32 v14, 0x41700000
 ; GENERIC-NEXT:    v_mov_b32_e32 v15, 0x41800000
-; GENERIC-NEXT:    v_mov_b32_e32 v16, 0x41700000
-; GENERIC-NEXT:    v_mov_b32_e32 v17, 0x41600000
-; GENERIC-NEXT:    v_mov_b32_e32 v18, 0x41500000
+; GENERIC-NEXT:    v_mov_b32_e32 v16, 0x41880000
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 3
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 4.0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 2
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v2, v0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 1
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 2.0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 0
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, 1.0, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 7
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v4, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 6
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v5, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 5
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 4
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, v9, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 11
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v11, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 10
+; GENERIC-NEXT:    s_mov_b32 m0, s4
+; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
+; GENERIC-NEXT:    s_mov_b32 s2, -1
+; GENERIC-NEXT:    v_movreld_b32_e32 v0, v16
+; GENERIC-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GENERIC-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v12, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 9
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v13, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 8
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v14, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 15
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v9, v15, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 14
-; GENERIC-NEXT:    buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v16, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 13
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v17, v10, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 12
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v18, v10, vcc
-; GENERIC-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GENERIC-NEXT:    s_endpgm
 ;
@@ -3419,65 +3111,32 @@ entry:
 define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %offset) {
 ; GENERIC-LABEL: insert_neg_offset_sgpr:
 ; GENERIC:       ; %bb.0: ; %entry
-; GENERIC-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
+; GENERIC-NEXT:    s_load_dword s4, s[4:5], 0xd
+; GENERIC-NEXT:    v_mov_b32_e32 v0, 0
+; GENERIC-NEXT:    v_mov_b32_e32 v1, 1
+; GENERIC-NEXT:    v_mov_b32_e32 v2, 2
+; GENERIC-NEXT:    v_mov_b32_e32 v3, 3
+; GENERIC-NEXT:    v_mov_b32_e32 v4, 4
+; GENERIC-NEXT:    v_mov_b32_e32 v5, 5
+; GENERIC-NEXT:    v_mov_b32_e32 v6, 6
+; GENERIC-NEXT:    v_mov_b32_e32 v7, 7
+; GENERIC-NEXT:    v_mov_b32_e32 v8, 8
+; GENERIC-NEXT:    v_mov_b32_e32 v9, 9
+; GENERIC-NEXT:    v_mov_b32_e32 v10, 10
+; GENERIC-NEXT:    v_mov_b32_e32 v11, 11
+; GENERIC-NEXT:    v_mov_b32_e32 v12, 12
+; GENERIC-NEXT:    v_mov_b32_e32 v13, 13
+; GENERIC-NEXT:    v_mov_b32_e32 v14, 14
+; GENERIC-NEXT:    v_mov_b32_e32 v15, 15
+; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT:    s_add_i32 m0, s4, 0xfffffe00
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
 ; GENERIC-NEXT:    s_mov_b32 s2, -1
-; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_addk_i32 s6, 0xfe00
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 3
-; GENERIC-NEXT:    s_cselect_b32 s4, 16, 3
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 2
-; GENERIC-NEXT:    s_cselect_b32 s5, 16, 2
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 1
-; GENERIC-NEXT:    v_mov_b32_e32 v3, s4
-; GENERIC-NEXT:    s_cselect_b32 s4, 16, 1
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 0
-; GENERIC-NEXT:    v_mov_b32_e32 v2, s5
-; GENERIC-NEXT:    s_cselect_b32 s5, 16, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 7
-; GENERIC-NEXT:    v_mov_b32_e32 v1, s4
-; GENERIC-NEXT:    s_cselect_b32 s4, 16, 7
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 6
-; GENERIC-NEXT:    v_mov_b32_e32 v0, s5
-; GENERIC-NEXT:    s_cselect_b32 s5, 16, 6
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 5
-; GENERIC-NEXT:    v_mov_b32_e32 v7, s4
-; GENERIC-NEXT:    s_cselect_b32 s4, 16, 5
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 4
-; GENERIC-NEXT:    v_mov_b32_e32 v6, s5
-; GENERIC-NEXT:    s_cselect_b32 s5, 16, 4
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 11
-; GENERIC-NEXT:    v_mov_b32_e32 v5, s4
-; GENERIC-NEXT:    s_cselect_b32 s4, 16, 11
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 10
-; GENERIC-NEXT:    v_mov_b32_e32 v4, s5
+; GENERIC-NEXT:    v_movreld_b32_e32 v0, 16
+; GENERIC-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GENERIC-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GENERIC-NEXT:    s_cselect_b32 s5, 16, 10
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 9
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_mov_b32_e32 v7, s4
-; GENERIC-NEXT:    s_cselect_b32 s4, 16, 9
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 8
-; GENERIC-NEXT:    v_mov_b32_e32 v6, s5
-; GENERIC-NEXT:    s_cselect_b32 s5, 16, 8
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 15
-; GENERIC-NEXT:    v_mov_b32_e32 v5, s4
-; GENERIC-NEXT:    s_cselect_b32 s4, 16, 15
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 14
-; GENERIC-NEXT:    v_mov_b32_e32 v4, s5
-; GENERIC-NEXT:    s_cselect_b32 s5, 16, 14
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 13
-; GENERIC-NEXT:    s_cselect_b32 s7, 16, 13
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 12
-; GENERIC-NEXT:    s_cselect_b32 s6, 16, 12
-; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_mov_b32_e32 v7, s4
-; GENERIC-NEXT:    v_mov_b32_e32 v6, s5
-; GENERIC-NEXT:    v_mov_b32_e32 v5, s7
-; GENERIC-NEXT:    v_mov_b32_e32 v4, s6
-; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GENERIC-NEXT:    s_endpgm
 ;
@@ -3779,67 +3438,34 @@ entry:
 define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ptr addrspace(1) %out, <16 x i32> %vec, i32 %offset) {
 ; GENERIC-LABEL: insert_neg_offset_sgpr_loadreg:
 ; GENERIC:       ; %bb.0: ; %entry
-; GENERIC-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0xb
-; GENERIC-NEXT:    s_load_dword s20, s[4:5], 0x29
-; GENERIC-NEXT:    s_load_dwordx16 s[0:15], s[4:5], 0x19
-; GENERIC-NEXT:    s_mov_b32 s19, 0xf000
-; GENERIC-NEXT:    s_mov_b32 s18, -1
+; GENERIC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x19
+; GENERIC-NEXT:    s_load_dword s6, s[4:5], 0x29
+; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
+; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
+; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_addk_i32 s20, 0xfe00
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 3
-; GENERIC-NEXT:    s_cselect_b32 s3, s3, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 2
-; GENERIC-NEXT:    s_cselect_b32 s2, s2, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 1
-; GENERIC-NEXT:    v_mov_b32_e32 v3, s3
-; GENERIC-NEXT:    s_cselect_b32 s1, s1, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 0
-; GENERIC-NEXT:    v_mov_b32_e32 v2, s2
-; GENERIC-NEXT:    s_cselect_b32 s0, s0, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 7
-; GENERIC-NEXT:    v_mov_b32_e32 v1, s1
-; GENERIC-NEXT:    s_cselect_b32 s1, s7, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 6
-; GENERIC-NEXT:    v_mov_b32_e32 v0, s0
-; GENERIC-NEXT:    s_cselect_b32 s0, s6, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 5
-; GENERIC-NEXT:    v_mov_b32_e32 v7, s1
-; GENERIC-NEXT:    s_cselect_b32 s1, s5, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 4
-; GENERIC-NEXT:    v_mov_b32_e32 v6, s0
-; GENERIC-NEXT:    s_cselect_b32 s0, s4, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 11
-; GENERIC-NEXT:    v_mov_b32_e32 v5, s1
-; GENERIC-NEXT:    s_cselect_b32 s1, s11, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 10
-; GENERIC-NEXT:    v_mov_b32_e32 v4, s0
-; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
-; GENERIC-NEXT:    s_cselect_b32 s0, s10, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 9
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_mov_b32_e32 v7, s1
-; GENERIC-NEXT:    s_cselect_b32 s1, s9, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 8
-; GENERIC-NEXT:    v_mov_b32_e32 v6, s0
-; GENERIC-NEXT:    s_cselect_b32 s0, s8, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 15
-; GENERIC-NEXT:    v_mov_b32_e32 v5, s1
-; GENERIC-NEXT:    s_cselect_b32 s1, s15, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 14
-; GENERIC-NEXT:    v_mov_b32_e32 v4, s0
-; GENERIC-NEXT:    s_cselect_b32 s0, s14, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 13
-; GENERIC-NEXT:    s_cselect_b32 s2, s13, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s20, 12
-; GENERIC-NEXT:    s_cselect_b32 s3, s12, 5
-; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:32
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_mov_b32_e32 v7, s1
-; GENERIC-NEXT:    v_mov_b32_e32 v6, s0
-; GENERIC-NEXT:    v_mov_b32_e32 v5, s2
-; GENERIC-NEXT:    v_mov_b32_e32 v4, s3
-; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:48
-; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GENERIC-NEXT:    v_mov_b32_e32 v0, s8
+; GENERIC-NEXT:    v_mov_b32_e32 v1, s9
+; GENERIC-NEXT:    v_mov_b32_e32 v2, s10
+; GENERIC-NEXT:    v_mov_b32_e32 v3, s11
+; GENERIC-NEXT:    v_mov_b32_e32 v4, s12
+; GENERIC-NEXT:    v_mov_b32_e32 v5, s13
+; GENERIC-NEXT:    v_mov_b32_e32 v6, s14
+; GENERIC-NEXT:    v_mov_b32_e32 v7, s15
+; GENERIC-NEXT:    v_mov_b32_e32 v8, s16
+; GENERIC-NEXT:    v_mov_b32_e32 v9, s17
+; GENERIC-NEXT:    v_mov_b32_e32 v10, s18
+; GENERIC-NEXT:    v_mov_b32_e32 v11, s19
+; GENERIC-NEXT:    v_mov_b32_e32 v12, s20
+; GENERIC-NEXT:    v_mov_b32_e32 v13, s21
+; GENERIC-NEXT:    v_mov_b32_e32 v14, s22
+; GENERIC-NEXT:    v_mov_b32_e32 v15, s23
+; GENERIC-NEXT:    s_add_i32 m0, s6, 0xfffffe00
+; GENERIC-NEXT:    v_movreld_b32_e32 v0, 5
+; GENERIC-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GENERIC-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GENERIC-NEXT:    s_endpgm
 ;
 ; NOOPT-LABEL: insert_neg_offset_sgpr_loadreg:
@@ -4098,9 +3724,6 @@ entry:
 define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GENERIC-LABEL: insert_neg_offset_vgpr:
 ; GENERIC:       ; %bb.0: ; %entry
-; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
-; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
-; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    v_add_i32_e32 v12, vcc, 0xfffffe00, v0
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
 ; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 4, 33, vcc
@@ -4125,6 +3748,7 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v12
 ; GENERIC-NEXT:    v_cndmask_b32_e64 v9, 10, 33, vcc
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v12
+; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
 ; GENERIC-NEXT:    v_cndmask_b32_e64 v8, 9, 33, vcc
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v12
 ; GENERIC-NEXT:    v_cndmask_b32_e64 v15, 16, 33, vcc
@@ -4133,6 +3757,8 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v12
 ; GENERIC-NEXT:    v_cndmask_b32_e64 v13, 14, 33, vcc
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v12
+; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
+; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    v_cndmask_b32_e64 v12, 13, 33, vcc
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
@@ -4586,9 +4212,6 @@ entry:
 define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GENERIC-LABEL: insert_neg_inline_offset_vgpr:
 ; GENERIC:       ; %bb.0: ; %entry
-; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
-; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
-; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    v_add_i32_e32 v12, vcc, -16, v0
 ; GENERIC-NEXT:    v_mov_b32_e32 v16, 0x1f4
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
@@ -4614,6 +4237,7 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v12
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v9, 10, v16, vcc
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v12
+; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v8, 9, v16, vcc
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v12
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v15, 16, v16, vcc
@@ -4622,6 +4246,8 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v12
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v13, 14, v16, vcc
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v12
+; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
+; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v12, 13, v16, vcc
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
@@ -5080,92 +4706,92 @@ entry:
 define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) {
 ; GENERIC-LABEL: extract_vgpr_offset_multiple_in_block:
 ; GENERIC:       ; %bb.0: ; %entry
-; GENERIC-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
-; GENERIC-NEXT:    s_mov_b32 s10, 0
+; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GENERIC-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x9
+; GENERIC-NEXT:    s_mov_b32 s11, 0xf000
+; GENERIC-NEXT:    s_mov_b32 s2, 0
+; GENERIC-NEXT:    s_mov_b32 s3, s11
 ; GENERIC-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GENERIC-NEXT:    v_mov_b32_e32 v2, 0
-; GENERIC-NEXT:    s_mov_b32 s11, s3
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    buffer_load_dword v1, v[1:2], s[8:11], 0 addr64 glc
+; GENERIC-NEXT:    buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GENERIC-NEXT:    s_mov_b32 s2, -1
+; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GENERIC-NEXT:    s_mov_b32 s10, -1
 ; GENERIC-NEXT:    ;;#ASMSTART
 ; GENERIC-NEXT:    s_mov_b32 s4, 17
 ; GENERIC-NEXT:    ;;#ASMEND
-; GENERIC-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
-; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 7, 9, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v1
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 11, v3, vcc
-; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GENERIC-NEXT:    v_cndmask_b32_e64 v4, 7, 9, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v1
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 13, v3, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 11, v4, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v1
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 5, v3, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 13, v4, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 5, v1
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 6, v3, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 6, v1
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 7, v3, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 5, v2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 6, v4, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 7, v1
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 8, v3, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 6, v2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 7, v4, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 8, v1
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 9, v3, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 7, v2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 8, v4, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v1
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 10, v3, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 8, v2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 9, v4, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v1
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 11, v3, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 10, v4, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 11, v1
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 12, v3, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 11, v4, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v1
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 13, v3, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 11, v2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 12, v4, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 13, v1
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 14, v3, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 13, v4, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v1
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 15, v3, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 13, v2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, 14, v4, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 15, v1
-; GENERIC-NEXT:    v_cndmask_b32_e32 v1, 16, v3, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, 15, v4, vcc
-; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 15, v2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v2, 16, v3, vcc
-; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GENERIC-NEXT:    v_add_i32_e64 v0, s[0:1], 1, v1
+; GENERIC-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v1
+; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 7, 9, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 2, v1
+; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 11, v2, s[0:1]
+; GENERIC-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 7, 9, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 3, v1
+; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 13, v2, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 2, v0
+; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 11, v3, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 4, v1
+; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 5, v2, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 3, v0
+; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 13, v3, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 5, v1
+; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 6, v2, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 4, v0
+; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 5, v3, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 6, v1
+; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 7, v2, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 5, v0
+; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 6, v3, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 7, v1
+; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 8, v2, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 6, v0
+; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 7, v3, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 8, v1
+; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 9, v2, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 7, v0
+; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 8, v3, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 9, v1
+; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 10, v2, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 8, v0
+; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 9, v3, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 10, v1
+; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 11, v2, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 9, v0
+; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 10, v3, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 11, v1
+; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 12, v2, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 10, v0
+; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 11, v3, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 12, v1
+; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 13, v2, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 11, v0
+; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 12, v3, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 13, v1
+; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 14, v2, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 12, v0
+; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 13, v3, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 14, v1
+; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 15, v2, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 13, v0
+; GENERIC-NEXT:    v_cndmask_b32_e64 v3, 14, v3, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 15, v1
+; GENERIC-NEXT:    v_cndmask_b32_e64 v1, 16, v2, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 14, v0
+; GENERIC-NEXT:    v_cndmask_b32_e64 v2, 15, v3, s[0:1]
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 15, v0
+; GENERIC-NEXT:    v_cndmask_b32_e64 v0, 16, v2, s[0:1]
+; GENERIC-NEXT:    buffer_store_dword v1, off, s[8:11], 0
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    buffer_store_dword v2, off, s[0:3], 0
+; GENERIC-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GENERIC-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GENERIC-NEXT:    s_cbranch_execz .LBB16_2
 ; GENERIC-NEXT:  ; %bb.1: ; %bb1
+; GENERIC-NEXT:    s_waitcnt expcnt(0)
 ; GENERIC-NEXT:    v_mov_b32_e32 v0, s4
-; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
 ; GENERIC-NEXT:  .LBB16_2: ; %bb2
 ; GENERIC-NEXT:    s_endpgm
@@ -5823,22 +5449,21 @@ bb2:
 define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) {
 ; GENERIC-LABEL: insert_vgpr_offset_multiple_in_block:
 ; GENERIC:       ; %bb.0: ; %entry
-; GENERIC-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0xd
-; GENERIC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x19
+; GENERIC-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
-; GENERIC-NEXT:    s_mov_b32 s26, 0
+; GENERIC-NEXT:    s_mov_b32 s10, 0
+; GENERIC-NEXT:    s_mov_b32 s11, s3
 ; GENERIC-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GENERIC-NEXT:    v_mov_b32_e32 v2, 0
-; GENERIC-NEXT:    s_mov_b32 s27, s3
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    buffer_load_dword v14, v[1:2], s[24:27], 0 addr64 glc
+; GENERIC-NEXT:    buffer_load_dword v14, v[1:2], s[8:11], 0 addr64 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
+; GENERIC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x19
 ; GENERIC-NEXT:    ;;#ASMSTART
 ; GENERIC-NEXT:    v_mov_b32 v1, 62
 ; GENERIC-NEXT:    ;;#ASMEND
-; GENERIC-NEXT:    v_mov_b32_e32 v10, s22
-; GENERIC-NEXT:    v_mov_b32_e32 v11, s23
-; GENERIC-NEXT:    v_mov_b32_e32 v15, s16
+; GENERIC-NEXT:    s_mov_b32 s2, -1
+; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
 ; GENERIC-NEXT:    v_mov_b32_e32 v2, s18
 ; GENERIC-NEXT:    v_mov_b32_e32 v3, s19
 ; GENERIC-NEXT:    v_mov_b32_e32 v4, s12
@@ -5849,6 +5474,9 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; GENERIC-NEXT:    v_mov_b32_e32 v9, s9
 ; GENERIC-NEXT:    v_mov_b32_e32 v12, s10
 ; GENERIC-NEXT:    v_mov_b32_e32 v13, s11
+; GENERIC-NEXT:    v_mov_b32_e32 v10, s22
+; GENERIC-NEXT:    v_mov_b32_e32 v11, s23
+; GENERIC-NEXT:    v_mov_b32_e32 v15, s16
 ; GENERIC-NEXT:    v_add_i32_e32 v18, vcc, 1, v14
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v14
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v16, v2, v1, vcc
@@ -5890,35 +5518,35 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v13, 63, v17, vcc
 ; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v18
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v12, 63, v16, vcc
-; GENERIC-NEXT:    v_mov_b32_e32 v16, s17
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v14
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v19, v10, v1, vcc
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v14
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v17, v11, v1, vcc
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v14
+; GENERIC-NEXT:    v_mov_b32_e32 v16, s17
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v10, v15, v1, vcc
 ; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v14
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v11, v16, v1, vcc
 ; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v18
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v11, 63, v11, vcc
 ; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 8, v18
+; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 15, v18
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v10, 63, v10, vcc
 ; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v18
-; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 15, v18
 ; GENERIC-NEXT:    v_cndmask_b32_e64 v17, 63, v17, s[0:1]
-; GENERIC-NEXT:    v_cndmask_b32_e32 v16, 63, v19, vcc
-; GENERIC-NEXT:    v_mov_b32_e32 v19, s20
-; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v14
 ; GENERIC-NEXT:    v_mov_b32_e32 v15, s21
 ; GENERIC-NEXT:    v_cmp_eq_u32_e64 s[0:1], 13, v14
+; GENERIC-NEXT:    v_cndmask_b32_e32 v16, 63, v19, vcc
+; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v14
 ; GENERIC-NEXT:    v_cndmask_b32_e64 v14, v15, v1, s[0:1]
 ; GENERIC-NEXT:    v_cmp_ne_u32_e64 s[0:1], 13, v18
 ; GENERIC-NEXT:    v_cndmask_b32_e64 v15, 63, v14, s[0:1]
 ; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GENERIC-NEXT:    s_mov_b32 s2, -1
+; GENERIC-NEXT:    v_mov_b32_e32 v19, s20
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v14, v19, v1, vcc
 ; GENERIC-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v18
 ; GENERIC-NEXT:    v_cndmask_b32_e32 v14, 63, v14, vcc
+; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
@@ -5928,7 +5556,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GENERIC-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GENERIC-NEXT:    s_cbranch_execz .LBB17_2
 ; GENERIC-NEXT:  ; %bb.1: ; %bb1
@@ -6763,134 +6390,56 @@ bb2:
 define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) {
 ; GENERIC-LABEL: insert_w_offset_multiple_in_block:
 ; GENERIC:       ; %bb.0: ; %entry
-; GENERIC-NEXT:    s_load_dwordx2 s[28:29], s[4:5], 0x9
-; GENERIC-NEXT:    s_load_dword s24, s[4:5], 0xb
-; GENERIC-NEXT:    s_mov_b32 s31, 0xf000
-; GENERIC-NEXT:    s_mov_b32 s30, -1
-; GENERIC-NEXT:    v_mov_b32_e32 v1, 0x41500000
-; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x41880000
-; GENERIC-NEXT:    v_mov_b32_e32 v2, 0x41600000
-; GENERIC-NEXT:    v_mov_b32_e32 v3, 0x41700000
-; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x41800000
-; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x41100000
-; GENERIC-NEXT:    v_mov_b32_e32 v6, 0x41200000
-; GENERIC-NEXT:    v_mov_b32_e32 v7, 0x41300000
-; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x41400000
-; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x40a00000
-; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x40c00000
-; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x40e00000
-; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41000000
-; GENERIC-NEXT:    v_mov_b32_e32 v15, 0x40400000
+; GENERIC-NEXT:    s_load_dword s2, s[4:5], 0xb
+; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GENERIC-NEXT:    v_mov_b32_e32 v0, 1.0
+; GENERIC-NEXT:    v_mov_b32_e32 v1, 2.0
+; GENERIC-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GENERIC-NEXT:    v_mov_b32_e32 v3, 4.0
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_add_i32 s25, s24, 1
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 12
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 13
-; GENERIC-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 14
-; GENERIC-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 15
-; GENERIC-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 8
-; GENERIC-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 9
-; GENERIC-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 10
-; GENERIC-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 11
-; GENERIC-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 4
-; GENERIC-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 5
-; GENERIC-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 6
-; GENERIC-NEXT:    s_cselect_b64 s[18:19], -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 7
-; GENERIC-NEXT:    s_cselect_b64 s[20:21], -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 0
-; GENERIC-NEXT:    s_cselect_b64 s[22:23], -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e64 v13, 1.0, v0, s[22:23]
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 1
-; GENERIC-NEXT:    s_cselect_b64 s[22:23], -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e64 v14, 2.0, v0, s[22:23]
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 2
-; GENERIC-NEXT:    s_cselect_b64 s[22:23], -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e64 v15, v15, v0, s[22:23]
-; GENERIC-NEXT:    s_cmp_eq_u32 s25, 3
-; GENERIC-NEXT:    s_cselect_b64 s[22:23], -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e64 v16, 4.0, v0, s[22:23]
-; GENERIC-NEXT:    s_add_i32 s26, s24, 2
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 3
-; GENERIC-NEXT:    buffer_store_dwordx4 v[13:16], off, s[28:31], 0
-; GENERIC-NEXT:    s_cselect_b64 s[22:23], -1, 0
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 2
-; GENERIC-NEXT:    s_cselect_b64 s[24:25], -1, 0
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_cndmask_b32_e64 v16, v0, v16, s[22:23]
-; GENERIC-NEXT:    v_cndmask_b32_e64 v15, v0, v15, s[24:25]
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 1
-; GENERIC-NEXT:    s_cselect_b64 s[22:23], -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e64 v14, v0, v14, s[22:23]
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 0
-; GENERIC-NEXT:    s_cselect_b64 s[22:23], -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e64 v13, v0, v13, s[22:23]
-; GENERIC-NEXT:    v_cndmask_b32_e64 v9, v9, v0, s[14:15]
-; GENERIC-NEXT:    v_cndmask_b32_e64 v10, v10, v0, s[16:17]
-; GENERIC-NEXT:    v_cndmask_b32_e64 v11, v11, v0, s[18:19]
-; GENERIC-NEXT:    v_cndmask_b32_e64 v12, v12, v0, s[20:21]
-; GENERIC-NEXT:    buffer_store_dwordx4 v[9:12], off, s[28:31], 0 offset:16
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 7
-; GENERIC-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 6
-; GENERIC-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_cndmask_b32_e64 v12, v0, v12, s[14:15]
-; GENERIC-NEXT:    v_cndmask_b32_e64 v11, v0, v11, s[16:17]
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 5
-; GENERIC-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e64 v10, v0, v10, s[14:15]
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 4
-; GENERIC-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e64 v9, v0, v9, s[14:15]
-; GENERIC-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GENERIC-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[0:1]
-; GENERIC-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s[2:3]
-; GENERIC-NEXT:    v_cndmask_b32_e64 v4, v4, v0, s[4:5]
-; GENERIC-NEXT:    v_cndmask_b32_e64 v5, v5, v0, s[6:7]
-; GENERIC-NEXT:    buffer_store_dwordx4 v[1:4], off, s[28:31], 0 offset:48
-; GENERIC-NEXT:    v_cndmask_b32_e64 v6, v6, v0, s[8:9]
-; GENERIC-NEXT:    v_cndmask_b32_e64 v7, v7, v0, s[10:11]
-; GENERIC-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s[12:13]
-; GENERIC-NEXT:    buffer_store_dwordx4 v[5:8], off, s[28:31], 0 offset:32
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 11
-; GENERIC-NEXT:    buffer_store_dwordx4 v[9:12], off, s[28:31], 0 offset:80
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_waitcnt expcnt(1)
-; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 10
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v0, v7, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 9
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v0, v6, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 8
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v0, v5, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 15
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 14
-; GENERIC-NEXT:    buffer_store_dwordx4 v[5:8], off, s[28:31], 0 offset:96
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 13
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s26, 12
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
-; GENERIC-NEXT:    buffer_store_dwordx4 v[1:4], off, s[28:31], 0 offset:112
-; GENERIC-NEXT:    buffer_store_dwordx4 v[13:16], off, s[28:31], 0 offset:64
+; GENERIC-NEXT:    s_add_i32 m0, s2, 1
+; GENERIC-NEXT:    v_mov_b32_e32 v4, 0x40a00000
+; GENERIC-NEXT:    v_mov_b32_e32 v5, 0x40c00000
+; GENERIC-NEXT:    v_mov_b32_e32 v6, 0x40e00000
+; GENERIC-NEXT:    v_mov_b32_e32 v7, 0x41000000
+; GENERIC-NEXT:    v_mov_b32_e32 v8, 0x41100000
+; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x41200000
+; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x41300000
+; GENERIC-NEXT:    v_mov_b32_e32 v11, 0x41400000
+; GENERIC-NEXT:    v_mov_b32_e32 v12, 0x41500000
+; GENERIC-NEXT:    v_mov_b32_e32 v13, 0x41600000
+; GENERIC-NEXT:    v_mov_b32_e32 v14, 0x41700000
+; GENERIC-NEXT:    v_mov_b32_e32 v15, 0x41800000
+; GENERIC-NEXT:    v_mov_b32_e32 v32, 0x41880000
+; GENERIC-NEXT:    v_movreld_b32_e32 v0, v32
+; GENERIC-NEXT:    v_mov_b32_e32 v31, v15
+; GENERIC-NEXT:    s_add_i32 m0, s2, 2
+; GENERIC-NEXT:    v_mov_b32_e32 v30, v14
+; GENERIC-NEXT:    v_mov_b32_e32 v29, v13
+; GENERIC-NEXT:    v_mov_b32_e32 v28, v12
+; GENERIC-NEXT:    v_mov_b32_e32 v27, v11
+; GENERIC-NEXT:    v_mov_b32_e32 v26, v10
+; GENERIC-NEXT:    v_mov_b32_e32 v25, v9
+; GENERIC-NEXT:    v_mov_b32_e32 v24, v8
+; GENERIC-NEXT:    v_mov_b32_e32 v23, v7
+; GENERIC-NEXT:    v_mov_b32_e32 v22, v6
+; GENERIC-NEXT:    v_mov_b32_e32 v21, v5
+; GENERIC-NEXT:    v_mov_b32_e32 v20, v4
+; GENERIC-NEXT:    v_mov_b32_e32 v19, v3
+; GENERIC-NEXT:    v_mov_b32_e32 v18, v2
+; GENERIC-NEXT:    v_mov_b32_e32 v17, v1
+; GENERIC-NEXT:    v_mov_b32_e32 v16, v0
+; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
+; GENERIC-NEXT:    s_mov_b32 s2, -1
+; GENERIC-NEXT:    v_movreld_b32_e32 v16, v32
+; GENERIC-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GENERIC-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GENERIC-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
+; GENERIC-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
+; GENERIC-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; GENERIC-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
 ; GENERIC-NEXT:    s_endpgm
 ;
 ; NOOPT-LABEL: insert_w_offset_multiple_in_block:
@@ -7396,7 +6945,6 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
 ; GENERIC-NEXT:    ;;#ASMSTART
 ; GENERIC-NEXT:    ; reg use v[0:3]
 ; GENERIC-NEXT:    ;;#ASMEND
-; GENERIC-NEXT:    s_mov_b64 vcc, exec
 ; GENERIC-NEXT:    s_cbranch_execnz .LBB19_3
 ; GENERIC-NEXT:  .LBB19_2: ; %bb1
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
@@ -7413,7 +6961,6 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
 ; GENERIC-NEXT:    s_endpgm
 ; GENERIC-NEXT:  .LBB19_4:
-; GENERIC-NEXT:    s_mov_b64 vcc, 0
 ; GENERIC-NEXT:    s_branch .LBB19_2
 ;
 ; NOOPT-LABEL: extract_adjacent_blocks:
@@ -7635,7 +7182,6 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
 ; GENERIC-NEXT:    ;;#ASMSTART
 ; GENERIC-NEXT:    ; reg use v[0:3]
 ; GENERIC-NEXT:    ;;#ASMEND
-; GENERIC-NEXT:    s_mov_b64 vcc, exec
 ; GENERIC-NEXT:    s_cbranch_execnz .LBB20_3
 ; GENERIC-NEXT:  .LBB20_2: ; %bb1
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
@@ -7652,7 +7198,6 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
 ; GENERIC-NEXT:    s_endpgm
 ; GENERIC-NEXT:  .LBB20_4:
-; GENERIC-NEXT:    s_mov_b64 vcc, 0
 ; GENERIC-NEXT:    s_branch .LBB20_2
 ;
 ; NOOPT-LABEL: insert_adjacent_blocks:
@@ -7873,19 +7418,15 @@ define amdgpu_kernel void @multi_same_block(i32 %arg) {
 ; GENERIC-LABEL: multi_same_block:
 ; GENERIC:       ; %bb.0: ; %bb
 ; GENERIC-NEXT:    s_load_dword s0, s[4:5], 0x9
-; GENERIC-NEXT:    v_mov_b32_e32 v0, 0x41900000
-; GENERIC-NEXT:    v_mov_b32_e32 v1, 0x41b0cccd
+; GENERIC-NEXT:    v_mov_b32_e32 v1, 0x41900000
+; GENERIC-NEXT:    v_mov_b32_e32 v9, 0x41b0cccd
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_add_i32 s2, s0, -16
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 1
-; GENERIC-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e64 v0, v0, 4.0, s[0:1]
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 5
-; GENERIC-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e64 v1, v1, -4.0, s[0:1]
+; GENERIC-NEXT:    s_add_i32 m0, s0, -16
+; GENERIC-NEXT:    v_movreld_b32_e32 v0, 4.0
+; GENERIC-NEXT:    v_movreld_b32_e32 v4, -4.0
 ; GENERIC-NEXT:    s_mov_b32 m0, -1
-; GENERIC-NEXT:    ds_write_b32 v0, v0
 ; GENERIC-NEXT:    ds_write_b32 v0, v1
+; GENERIC-NEXT:    ds_write_b32 v0, v9
 ; GENERIC-NEXT:    s_endpgm
 ;
 ; NOOPT-LABEL: multi_same_block:
@@ -8042,72 +7583,28 @@ bb:
 define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
 ; GENERIC-LABEL: extract_largest_inbounds_offset:
 ; GENERIC:       ; %bb.0: ; %entry
-; GENERIC-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
-; GENERIC-NEXT:    s_mov_b32 s2, -1
+; GENERIC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GENERIC-NEXT:    s_load_dword s12, s[4:5], 0xd
-; GENERIC-NEXT:    s_mov_b32 s6, s2
-; GENERIC-NEXT:    s_mov_b32 s7, s3
+; GENERIC-NEXT:    s_mov_b32 s7, 0xf000
+; GENERIC-NEXT:    s_mov_b32 s6, -1
+; GENERIC-NEXT:    s_mov_b32 s10, s6
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_mov_b32 s0, s8
-; GENERIC-NEXT:    s_mov_b32 s1, s9
-; GENERIC-NEXT:    s_mov_b32 s4, s10
-; GENERIC-NEXT:    s_mov_b32 s5, s11
-; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc
+; GENERIC-NEXT:    s_mov_b32 s8, s2
+; GENERIC-NEXT:    s_mov_b32 s9, s3
+; GENERIC-NEXT:    s_mov_b32 s11, s7
+; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 glc
+; GENERIC-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 glc
+; GENERIC-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 glc
+; GENERIC-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    s_add_i32 s12, s12, 15
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 1
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 3
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 4
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 5
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 6
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 7
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 8
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 9
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 10
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 11
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 12
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 13
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 14
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 15
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v15, vcc
-; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT:    s_add_i32 m0, s12, 15
+; GENERIC-NEXT:    s_mov_b32 s4, s0
+; GENERIC-NEXT:    s_mov_b32 s5, s1
+; GENERIC-NEXT:    v_movrels_b32_e32 v0, v0
+; GENERIC-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GENERIC-NEXT:    s_endpgm
 ;
 ; NOOPT-LABEL: extract_largest_inbounds_offset:
@@ -8349,72 +7846,28 @@ entry:
 define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
 ; GENERIC-LABEL: extract_out_of_bounds_offset:
 ; GENERIC:       ; %bb.0: ; %entry
-; GENERIC-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
-; GENERIC-NEXT:    s_mov_b32 s2, -1
+; GENERIC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GENERIC-NEXT:    s_load_dword s12, s[4:5], 0xd
-; GENERIC-NEXT:    s_mov_b32 s6, s2
-; GENERIC-NEXT:    s_mov_b32 s7, s3
+; GENERIC-NEXT:    s_mov_b32 s7, 0xf000
+; GENERIC-NEXT:    s_mov_b32 s6, -1
+; GENERIC-NEXT:    s_mov_b32 s10, s6
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_mov_b32 s0, s8
-; GENERIC-NEXT:    s_mov_b32 s1, s9
-; GENERIC-NEXT:    s_mov_b32 s4, s10
-; GENERIC-NEXT:    s_mov_b32 s5, s11
-; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc
+; GENERIC-NEXT:    s_mov_b32 s8, s2
+; GENERIC-NEXT:    s_mov_b32 s9, s3
+; GENERIC-NEXT:    s_mov_b32 s11, s7
+; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 glc
+; GENERIC-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 glc
+; GENERIC-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 glc
+; GENERIC-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    s_add_i32 s12, s12, 16
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 1
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 3
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 4
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 5
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 6
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 7
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 8
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 9
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 10
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 11
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 12
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 13
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 14
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s12, 15
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v15, vcc
-; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT:    s_add_i32 m0, s12, 16
+; GENERIC-NEXT:    s_mov_b32 s4, s0
+; GENERIC-NEXT:    s_mov_b32 s5, s1
+; GENERIC-NEXT:    v_movrels_b32_e32 v0, v0
+; GENERIC-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GENERIC-NEXT:    s_endpgm
 ;
 ; NOOPT-LABEL: extract_out_of_bounds_offset:
@@ -8656,73 +8109,28 @@ entry:
 define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx.in) {
 ; GENERIC-LABEL: extractelement_v16i32_or_index:
 ; GENERIC:       ; %bb.0: ; %entry
-; GENERIC-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
-; GENERIC-NEXT:    s_mov_b32 s2, -1
+; GENERIC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GENERIC-NEXT:    s_load_dword s12, s[4:5], 0xd
-; GENERIC-NEXT:    s_mov_b32 s6, s2
-; GENERIC-NEXT:    s_mov_b32 s7, s3
+; GENERIC-NEXT:    s_mov_b32 s7, 0xf000
+; GENERIC-NEXT:    s_mov_b32 s6, -1
+; GENERIC-NEXT:    s_mov_b32 s10, s6
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_mov_b32 s0, s8
-; GENERIC-NEXT:    s_mov_b32 s1, s9
-; GENERIC-NEXT:    s_mov_b32 s4, s10
-; GENERIC-NEXT:    s_mov_b32 s5, s11
-; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc
+; GENERIC-NEXT:    s_mov_b32 s8, s2
+; GENERIC-NEXT:    s_mov_b32 s9, s3
+; GENERIC-NEXT:    s_mov_b32 s11, s7
+; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 glc
+; GENERIC-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 glc
+; GENERIC-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 glc
+; GENERIC-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    s_lshl_b32 s4, s12, 2
-; GENERIC-NEXT:    s_or_b32 s4, s4, 1
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 1
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 2
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 3
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 4
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 5
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 6
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 7
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 8
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 9
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 10
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 11
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 12
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 13
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 14
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 15
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v0, v15, vcc
-; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT:    s_lshl_b32 m0, s12, 2
+; GENERIC-NEXT:    s_mov_b32 s4, s0
+; GENERIC-NEXT:    s_mov_b32 s5, s1
+; GENERIC-NEXT:    v_movrels_b32_e32 v0, v1
+; GENERIC-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GENERIC-NEXT:    s_endpgm
 ;
 ; NOOPT-LABEL: extractelement_v16i32_or_index:
@@ -8965,84 +8373,34 @@ entry:
 define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, <16 x float> %a, i32 %idx.in) nounwind {
 ; GENERIC-LABEL: insertelement_v16f32_or_index:
 ; GENERIC:       ; %bb.0:
-; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GENERIC-NEXT:    s_load_dword s6, s[4:5], 0x29
 ; GENERIC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x19
-; GENERIC-NEXT:    s_load_dword s4, s[4:5], 0x29
+; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GENERIC-NEXT:    v_mov_b32_e32 v16, 0x40a00000
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
-; GENERIC-NEXT:    s_mov_b32 s2, -1
-; GENERIC-NEXT:    v_mov_b32_e32 v10, 0x40a00000
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_lshl_b32 s4, s4, 2
-; GENERIC-NEXT:    v_mov_b32_e32 v0, s11
-; GENERIC-NEXT:    v_mov_b32_e32 v1, s10
-; GENERIC-NEXT:    v_mov_b32_e32 v4, s9
-; GENERIC-NEXT:    v_mov_b32_e32 v5, s8
-; GENERIC-NEXT:    v_mov_b32_e32 v6, s15
-; GENERIC-NEXT:    v_mov_b32_e32 v8, s14
-; GENERIC-NEXT:    v_mov_b32_e32 v9, s13
-; GENERIC-NEXT:    v_mov_b32_e32 v11, s12
-; GENERIC-NEXT:    v_mov_b32_e32 v12, s19
-; GENERIC-NEXT:    v_mov_b32_e32 v13, s18
-; GENERIC-NEXT:    v_mov_b32_e32 v14, s17
-; GENERIC-NEXT:    v_mov_b32_e32 v15, s16
-; GENERIC-NEXT:    v_mov_b32_e32 v16, s23
-; GENERIC-NEXT:    v_mov_b32_e32 v17, s22
-; GENERIC-NEXT:    v_mov_b32_e32 v18, s21
-; GENERIC-NEXT:    v_mov_b32_e32 v19, s20
-; GENERIC-NEXT:    s_or_b32 s4, s4, 1
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 3
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v3, v10, v0, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 2
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v2, v10, v1, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 1
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v1, v10, v4, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 0
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v0, v10, v5, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 7
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v10, v6, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 6
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v10, v8, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 5
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 4
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v4, v10, v11, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 11
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v10, v12, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 10
+; GENERIC-NEXT:    s_lshl_b32 m0, s6, 2
+; GENERIC-NEXT:    v_mov_b32_e32 v0, s8
+; GENERIC-NEXT:    v_mov_b32_e32 v1, s9
+; GENERIC-NEXT:    v_mov_b32_e32 v2, s10
+; GENERIC-NEXT:    v_mov_b32_e32 v3, s11
+; GENERIC-NEXT:    v_mov_b32_e32 v4, s12
+; GENERIC-NEXT:    v_mov_b32_e32 v5, s13
+; GENERIC-NEXT:    v_mov_b32_e32 v6, s14
+; GENERIC-NEXT:    v_mov_b32_e32 v7, s15
+; GENERIC-NEXT:    v_mov_b32_e32 v8, s16
+; GENERIC-NEXT:    v_mov_b32_e32 v9, s17
+; GENERIC-NEXT:    v_mov_b32_e32 v10, s18
+; GENERIC-NEXT:    v_mov_b32_e32 v11, s19
+; GENERIC-NEXT:    v_mov_b32_e32 v12, s20
+; GENERIC-NEXT:    v_mov_b32_e32 v13, s21
+; GENERIC-NEXT:    v_mov_b32_e32 v14, s22
+; GENERIC-NEXT:    v_mov_b32_e32 v15, s23
+; GENERIC-NEXT:    s_mov_b32 s2, -1
+; GENERIC-NEXT:    v_movreld_b32_e32 v1, v16
+; GENERIC-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GENERIC-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v10, v13, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 9
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v10, v14, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 8
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v5, v10, v15, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 15
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v9, v10, v16, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 14
-; GENERIC-NEXT:    buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_cndmask_b32_e32 v8, v10, v17, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 13
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, v10, v18, vcc
-; GENERIC-NEXT:    s_cmp_lg_u32 s4, 12
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, v10, v19, vcc
-; GENERIC-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GENERIC-NEXT:    s_endpgm
 ;
@@ -9309,29 +8667,39 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; GENERIC-LABEL: broken_phi_bb:
 ; GENERIC:       ; %bb.0: ; %bb
 ; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GENERIC-NEXT:    s_mov_b32 s6, 8
+; GENERIC-NEXT:    v_mov_b32_e32 v0, 8
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
 ; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    s_branch .LBB26_2
-; GENERIC-NEXT:  .LBB26_1: ; %Flow
-; GENERIC-NEXT:    ; in Loop: Header=BB26_2 Depth=1
-; GENERIC-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GENERIC-NEXT:    s_cbranch_vccz .LBB26_4
+; GENERIC-NEXT:  .LBB26_1:
+; GENERIC-NEXT:    ; implicit-def: $vgpr0
+; GENERIC-NEXT:    s_branch .LBB26_6
 ; GENERIC-NEXT:  .LBB26_2: ; %bb2
-; GENERIC-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GENERIC-NEXT:    ; =>This Loop Header: Depth=1
+; GENERIC-NEXT:    ; Child Loop BB26_4 Depth 2
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_cmp_ge_i32 s6, s0
-; GENERIC-NEXT:    s_mov_b64 s[4:5], -1
-; GENERIC-NEXT:    ; implicit-def: $sgpr6
-; GENERIC-NEXT:    s_cbranch_scc1 .LBB26_1
+; GENERIC-NEXT:    v_cmp_le_i32_e32 vcc, s0, v0
+; GENERIC-NEXT:    s_cbranch_vccnz .LBB26_1
 ; GENERIC-NEXT:  ; %bb.3: ; %bb4
 ; GENERIC-NEXT:    ; in Loop: Header=BB26_2 Depth=1
-; GENERIC-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GENERIC-NEXT:    buffer_load_dword v16, off, s[0:3], 0 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    s_mov_b64 s[4:5], 0
-; GENERIC-NEXT:    s_mov_b32 s6, s1
-; GENERIC-NEXT:    s_branch .LBB26_1
-; GENERIC-NEXT:  .LBB26_4: ; %bb8
+; GENERIC-NEXT:    v_mov_b32_e32 v17, s1
+; GENERIC-NEXT:    s_mov_b64 s[4:5], exec
+; GENERIC-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GENERIC-NEXT:  .LBB26_4: ; Parent Loop BB26_2 Depth=1
+; GENERIC-NEXT:    ; => This Inner Loop Header: Depth=2
+; GENERIC-NEXT:    v_readfirstlane_b32 s6, v16
+; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v16
+; GENERIC-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GENERIC-NEXT:    s_mov_b32 m0, s6
+; GENERIC-NEXT:    v_movreld_b32_e32 v0, v17
+; GENERIC-NEXT:    s_xor_b64 exec, exec, vcc
+; GENERIC-NEXT:    s_cbranch_execnz .LBB26_4
+; GENERIC-NEXT:  ; %bb.5: ; in Loop: Header=BB26_2 Depth=1
+; GENERIC-NEXT:    s_mov_b64 exec, s[4:5]
+; GENERIC-NEXT:    s_cbranch_execnz .LBB26_2
+; GENERIC-NEXT:  .LBB26_6: ; %bb8
 ; GENERIC-NEXT:    s_endpgm
 ;
 ; NOOPT-LABEL: broken_phi_bb:
@@ -9755,65 +9123,40 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
 ; GENERIC-NEXT:    v_mov_b32_e32 v2, s4
 ; GENERIC-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GENERIC-NEXT:    s_mov_b32 s2, 0
+; GENERIC-NEXT:    v_mov_b32_e32 v5, 0
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
 ; GENERIC-NEXT:    s_mov_b32 s0, s2
 ; GENERIC-NEXT:    s_mov_b32 s1, s2
+; GENERIC-NEXT:    v_mov_b32_e32 v6, v5
+; GENERIC-NEXT:    v_mov_b32_e32 v7, v5
+; GENERIC-NEXT:    v_mov_b32_e32 v8, v5
+; GENERIC-NEXT:    v_mov_b32_e32 v9, v5
+; GENERIC-NEXT:    v_mov_b32_e32 v10, v5
+; GENERIC-NEXT:    v_mov_b32_e32 v11, v5
+; GENERIC-NEXT:    v_mov_b32_e32 v12, v5
+; GENERIC-NEXT:    v_mov_b32_e32 v13, v5
+; GENERIC-NEXT:    v_mov_b32_e32 v14, v5
+; GENERIC-NEXT:    v_mov_b32_e32 v15, v5
+; GENERIC-NEXT:    v_mov_b32_e32 v16, v5
+; GENERIC-NEXT:    v_mov_b32_e32 v17, v5
+; GENERIC-NEXT:    v_mov_b32_e32 v18, v5
+; GENERIC-NEXT:    v_mov_b32_e32 v19, v5
+; GENERIC-NEXT:    v_mov_b32_e32 v20, v5
+; GENERIC-NEXT:    s_mov_b64 s[4:5], exec
+; GENERIC-NEXT:  .LBB27_1: ; =>This Inner Loop Header: Depth=1
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
-; GENERIC-NEXT:    v_readfirstlane_b32 s4, v2
-; GENERIC-NEXT:    s_or_b32 s4, s4, 1
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 3
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 2
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v7, 0, v4, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 1
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v6, 0, v4, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 0
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 7
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v12, 0, v4, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 6
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v11, 0, v4, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 5
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 4
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v9, 0, v4, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 11
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v13, 0, v4, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 10
+; GENERIC-NEXT:    v_readfirstlane_b32 s6, v2
+; GENERIC-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v2
+; GENERIC-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GENERIC-NEXT:    s_mov_b32 m0, s6
+; GENERIC-NEXT:    v_movreld_b32_e32 v6, v4
+; GENERIC-NEXT:    s_xor_b64 exec, exec, vcc
+; GENERIC-NEXT:    s_cbranch_execnz .LBB27_1
+; GENERIC-NEXT:  ; %bb.2:
+; GENERIC-NEXT:    s_mov_b64 exec, s[4:5]
+; GENERIC-NEXT:    buffer_store_dwordx4 v[17:20], v[0:1], s[0:3], 0 addr64 offset:48
+; GENERIC-NEXT:    buffer_store_dwordx4 v[13:16], v[0:1], s[0:3], 0 addr64 offset:32
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[9:12], v[0:1], s[0:3], 0 addr64 offset:16
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_cndmask_b32_e32 v12, 0, v4, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 9
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v11, 0, v4, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 8
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v10, 0, v4, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 15
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v14, 0, v4, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 14
-; GENERIC-NEXT:    buffer_store_dwordx4 v[10:13], v[0:1], s[0:3], 0 addr64 offset:32
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    s_waitcnt expcnt(0)
-; GENERIC-NEXT:    v_cndmask_b32_e32 v13, 0, v4, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 13
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v12, 0, v4, vcc
-; GENERIC-NEXT:    s_cmp_eq_u32 s4, 12
-; GENERIC-NEXT:    s_cselect_b64 vcc, -1, 0
-; GENERIC-NEXT:    v_cndmask_b32_e32 v11, 0, v4, vcc
-; GENERIC-NEXT:    buffer_store_dwordx4 v[11:14], v[0:1], s[0:3], 0 addr64 offset:48
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[5:8], v[0:1], s[0:3], 0 addr64
 ; GENERIC-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index ef51ec106a935..27dea55da5d97 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel < %s | FileCheck -check-prefix=GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -global-isel < %s | FileCheck -check-prefix=GISEL %s
 
 @gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
 @gv.fptr1 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
@@ -8,50 +8,50 @@
 define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) #0 {
 ; GCN-LABEL: test_indirect_call_sgpr_ptr:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GCN-NEXT:    s_add_u32 s0, s0, s17
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    s_mov_b32 s13, s15
 ; GCN-NEXT:    s_mov_b32 s12, s14
 ; GCN-NEXT:    s_getpc_b64 s[14:15]
 ; GCN-NEXT:    s_add_u32 s14, s14, gv.fptr0@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s15, s15, gv.fptr0@rel32@hi+12
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GCN-NEXT:    s_load_dwordx2 s[18:19], s[14:15], 0x0
-; GCN-NEXT:    s_add_u32 s8, s8, 8
-; GCN-NEXT:    s_addc_u32 s9, s9, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NEXT:    s_add_u32 s8, s8, 8
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_addc_u32 s9, s9, 0
 ; GCN-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GCN-NEXT:    s_mov_b32 s14, s16
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GCN-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_indirect_call_sgpr_ptr:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_mov_b32 s32, 0
-; GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GISEL-NEXT:    s_add_i32 s12, s12, s17
 ; GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GISEL-NEXT:    s_add_u32 s0, s0, s17
 ; GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GISEL-NEXT:    s_mov_b32 s13, s15
 ; GISEL-NEXT:    s_mov_b32 s12, s14
 ; GISEL-NEXT:    s_getpc_b64 s[14:15]
 ; GISEL-NEXT:    s_add_u32 s14, s14, gv.fptr0@rel32@lo+4
 ; GISEL-NEXT:    s_addc_u32 s15, s15, gv.fptr0@rel32@hi+12
-; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GISEL-NEXT:    s_load_dwordx2 s[18:19], s[14:15], 0x0
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GISEL-NEXT:    s_add_u32 s8, s8, 8
-; GISEL-NEXT:    s_addc_u32 s9, s9, 0
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 20, v2
+; GISEL-NEXT:    s_addc_u32 s9, s9, 0
 ; GISEL-NEXT:    v_or_b32_e32 v31, v0, v1
 ; GISEL-NEXT:    s_mov_b32 s14, s16
+; GISEL-NEXT:    s_mov_b32 s32, 0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GISEL-NEXT:    s_endpgm
@@ -63,52 +63,52 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) #0 {
 define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) #0 {
 ; GCN-LABEL: test_indirect_call_sgpr_ptr_arg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GCN-NEXT:    s_add_u32 s0, s0, s17
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    s_mov_b32 s13, s15
 ; GCN-NEXT:    s_mov_b32 s12, s14
 ; GCN-NEXT:    s_getpc_b64 s[14:15]
 ; GCN-NEXT:    s_add_u32 s14, s14, gv.fptr1@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s15, s15, gv.fptr1@rel32@hi+12
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GCN-NEXT:    s_load_dwordx2 s[18:19], s[14:15], 0x0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GCN-NEXT:    s_add_u32 s8, s8, 8
-; GCN-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_addc_u32 s9, s9, 0
 ; GCN-NEXT:    v_or_b32_e32 v31, v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GCN-NEXT:    s_mov_b32 s14, s16
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GCN-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_indirect_call_sgpr_ptr_arg:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_mov_b32 s32, 0
-; GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GISEL-NEXT:    s_add_i32 s12, s12, s17
 ; GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GISEL-NEXT:    s_add_u32 s0, s0, s17
 ; GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GISEL-NEXT:    s_mov_b32 s13, s15
 ; GISEL-NEXT:    s_mov_b32 s12, s14
 ; GISEL-NEXT:    s_getpc_b64 s[14:15]
 ; GISEL-NEXT:    s_add_u32 s14, s14, gv.fptr1@rel32@lo+4
 ; GISEL-NEXT:    s_addc_u32 s15, s15, gv.fptr1@rel32@hi+12
-; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GISEL-NEXT:    s_load_dwordx2 s[18:19], s[14:15], 0x0
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GISEL-NEXT:    s_add_u32 s8, s8, 8
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 20, v2
 ; GISEL-NEXT:    s_addc_u32 s9, s9, 0
-; GISEL-NEXT:    v_or_b32_e32 v31, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v31, v0, v1
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GISEL-NEXT:    s_mov_b32 s14, s16
+; GISEL-NEXT:    s_mov_b32 s32, 0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GISEL-NEXT:    s_endpgm
@@ -524,8 +524,8 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) #0 {
 ; GCN-NEXT:    s_cbranch_execnz .LBB4_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[54:55]
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v2
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 16
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v2
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 17
 ; GCN-NEXT:    v_readlane_b32 s65, v40, 15
 ; GCN-NEXT:    v_readlane_b32 s64, v40, 14
@@ -610,8 +610,8 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) #0 {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB4_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[54:55]
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v1
 ; GISEL-NEXT:    v_readlane_b32 s30, v40, 16
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v1
 ; GISEL-NEXT:    v_readlane_b32 s31, v40, 17
 ; GISEL-NEXT:    v_readlane_b32 s65, v40, 15
 ; GISEL-NEXT:    v_readlane_b32 s64, v40, 14
@@ -673,6 +673,7 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) #0 {
 ; GCN-NEXT:    v_writelane_b32 v40, s67, 17
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 18
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 19
+; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GCN-NEXT:    s_mov_b32 s50, s15
 ; GCN-NEXT:    s_mov_b32 s51, s14
 ; GCN-NEXT:    s_mov_b32 s52, s13
@@ -681,7 +682,6 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) #0 {
 ; GCN-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[38:39], s[6:7]
 ; GCN-NEXT:    s_mov_b64 s[48:49], s[4:5]
-; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GCN-NEXT:    s_and_saveexec_b64 s[54:55], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB5_4
@@ -768,6 +768,7 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) #0 {
 ; GISEL-NEXT:    v_writelane_b32 v40, s67, 17
 ; GISEL-NEXT:    v_writelane_b32 v40, s30, 18
 ; GISEL-NEXT:    v_writelane_b32 v40, s31, 19
+; GISEL-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GISEL-NEXT:    s_mov_b32 s50, s15
 ; GISEL-NEXT:    s_mov_b32 s51, s14
 ; GISEL-NEXT:    s_mov_b32 s52, s13
@@ -776,7 +777,6 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) #0 {
 ; GISEL-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GISEL-NEXT:    s_mov_b64 s[38:39], s[6:7]
 ; GISEL-NEXT:    s_mov_b64 s[48:49], s[4:5]
-; GISEL-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GISEL-NEXT:    s_and_saveexec_b64 s[54:55], vcc
 ; GISEL-NEXT:    s_cbranch_execz .LBB5_4
@@ -1015,6 +1015,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) #0 {
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v40
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    v_readlane_b32 s30, v41, 14
 ; GCN-NEXT:    v_readlane_b32 s31, v41, 15
 ; GCN-NEXT:    v_readlane_b32 s55, v41, 13
@@ -1031,7 +1032,6 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) #0 {
 ; GCN-NEXT:    v_readlane_b32 s36, v41, 2
 ; GCN-NEXT:    v_readlane_b32 s35, v41, 1
 ; GCN-NEXT:    v_readlane_b32 s34, v41, 0
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -1081,6 +1081,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) #0 {
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v40
+; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GISEL-NEXT:    v_readlane_b32 s30, v41, 14
 ; GISEL-NEXT:    v_readlane_b32 s31, v41, 15
 ; GISEL-NEXT:    v_readlane_b32 s55, v41, 13
@@ -1097,7 +1098,6 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) #0 {
 ; GISEL-NEXT:    v_readlane_b32 s36, v41, 2
 ; GISEL-NEXT:    v_readlane_b32 s35, v41, 1
 ; GISEL-NEXT:    v_readlane_b32 s34, v41, 0
-; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -1153,8 +1153,8 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) #0 {
 ; GCN-NEXT:    s_cbranch_execnz .LBB8_1
 ; GCN-NEXT:  ; %bb.2:
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 14
+; GCN-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 15
 ; GCN-NEXT:    v_readlane_b32 s55, v40, 13
 ; GCN-NEXT:    v_readlane_b32 s54, v40, 12
@@ -1217,8 +1217,8 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) #0 {
 ; GISEL-NEXT:    s_cbranch_execnz .LBB8_1
 ; GISEL-NEXT:  ; %bb.2:
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v0, v2
 ; GISEL-NEXT:    v_readlane_b32 s30, v40, 14
+; GISEL-NEXT:    v_mov_b32_e32 v0, v2
 ; GISEL-NEXT:    v_readlane_b32 s31, v40, 15
 ; GISEL-NEXT:    v_readlane_b32 s55, v40, 13
 ; GISEL-NEXT:    v_readlane_b32 s54, v40, 12
diff --git a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
index 5dff7372ab561..636270d8fa366 100644
--- a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsad < %s | FileCheck %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsad -mcpu=gfx700 < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck %s
 
 ; Check illegal casts are codegened as poison, and not an error.
 
 define amdgpu_kernel void @use_group_to_global_addrspacecast(ptr addrspace(3) %ptr) {
 ; CHECK-LABEL: use_group_to_global_addrspacecast:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CHECK-NEXT:    s_add_i32 s12, s12, s17
+; CHECK-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CHECK-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    flat_store_dword v[0:1], v0
diff --git a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
index 283ebc30bf939..1a9939bd4f2bc 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra -print-regusage -filetype=null 2>&1 < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-new-pm=1 -enable-ipra -print-regusage -filetype=null 2>&1 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-ipra -print-regusage -filetype=null 2>&1 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-new-pm=1 -enable-ipra -print-regusage -filetype=null 2>&1 < %s | FileCheck %s
 
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -stop-after=prologepilog -o - %s \
-; RUN:   | llc -x=mir -mtriple=amdgcn-amd-amdhsa -passes="module(require<reg-usage>,function(machine-function(reg-usage-collector)),print<reg-usage>)" -filetype=null 2>&1 \
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -stop-after=prologepilog -o - %s \
+; RUN:   | llc -x=mir -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -passes="module(require<reg-usage>,function(machine-function(reg-usage-collector)),print<reg-usage>)" -filetype=null 2>&1 \
 ; RUN:   | FileCheck %s
 
 ; Make sure the expected regmask is generated for sub/superregisters.
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index ea0a8cbe663fd..6fdc636020df5 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn | FileCheck -check-prefixes=SI %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx600 | FileCheck -check-prefixes=SI %s
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=VI %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck -check-prefixes=EGCM,EG %s
@@ -8,12 +8,12 @@
 define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind {
 ; SI-LABEL: i8_arg:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s2, 0xff
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_and_b32 s4, s6, 0xff
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -80,12 +80,12 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw
 define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind {
 ; SI-LABEL: i8_zext_arg:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s2, 0xff
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_and_b32 s4, s6, 0xff
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -155,12 +155,12 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe
 define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind {
 ; SI-LABEL: i8_sext_arg:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_sext_i32_i8 s4, s2
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_sext_i32_i8 s4, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -230,12 +230,12 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe
 define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind {
 ; SI-LABEL: i16_arg:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s2, 0xffff
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_and_b32 s4, s6, 0xffff
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -302,12 +302,12 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou
 define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind {
 ; SI-LABEL: i16_zext_arg:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s2, 0xffff
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_and_b32 s4, s6, 0xffff
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -377,12 +377,12 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer
 define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 signext %in) nounwind {
 ; SI-LABEL: i16_sext_arg:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_sext_i32_i16 s4, s2
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_sext_i32_i16 s4, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -722,9 +722,9 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32>
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -785,9 +785,9 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -847,11 +847,11 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s4, s6, 16
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v0, s6
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_mov_b32_e32 v0, s6
 ; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -1107,10 +1107,10 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32>
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0 offset:8
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -1182,10 +1182,10 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0 offset:8
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -1315,9 +1315,9 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -1663,8 +1663,8 @@ entry:
 define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> %in) nounwind {
 ; SI-LABEL: v5i16_arg:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xf
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
@@ -2358,9 +2358,9 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -4459,18 +4459,19 @@ entry:
 define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind {
 ; SI-LABEL: i65_arg:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s8, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s8, s6, 1
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_and_b32 s0, s8, 1
 ; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:8
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:8
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -4577,12 +4578,12 @@ entry:
 define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
 ; SI-LABEL: i1_arg:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s2, 1
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_and_b32 s4, s6, 1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -4668,12 +4669,12 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
 define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind {
 ; SI-LABEL: i1_arg_zext_i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s2, 1
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_and_b32 s4, s6, 1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -4744,9 +4745,9 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_and_b32 s4, s6, 1
-; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -4816,12 +4817,12 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin
 define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind {
 ; SI-LABEL: i1_arg_sext_i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_i32 s4, s2, 0x10000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bfe_i32 s4, s6, 0x10000
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -4890,12 +4891,12 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin
 define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind {
 ; SI-LABEL: i1_arg_sext_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_i64 s[4:5], s[2:3], 0x10000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bfe_i64 s[4:5], s[6:7], 0x10000
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -5134,26 +5135,26 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_load_ubyte v4, off, s[4:7], 0 offset:50
+; SI-NEXT:    buffer_load_ubyte v5, off, s[4:7], 0 offset:52
+; SI-NEXT:    buffer_load_ubyte v6, off, s[4:7], 0 offset:49
+; SI-NEXT:    buffer_load_ubyte v7, off, s[4:7], 0 offset:51
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
 ; SI-NEXT:    s_load_dword s2, s[4:5], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa
-; SI-NEXT:    buffer_load_ubyte v4, off, s[4:7], 0 offset:49
-; SI-NEXT:    buffer_load_ubyte v5, off, s[4:7], 0 offset:50
-; SI-NEXT:    buffer_load_ubyte v6, off, s[4:7], 0 offset:51
-; SI-NEXT:    buffer_load_ubyte v7, off, s[4:7], 0 offset:52
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v8, s2
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    buffer_store_dword v8, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
-; SI-NEXT:    v_or_b32_e32 v2, v2, v4
-; SI-NEXT:    v_or_b32_e32 v3, v3, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
+; SI-NEXT:    v_or_b32_e32 v2, v2, v6
+; SI-NEXT:    v_or_b32_e32 v3, v3, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
@@ -5484,11 +5485,11 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(1)
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
@@ -5593,12 +5594,12 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
 ; SI-LABEL: array_3xi16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s0, s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0 offset:42
 ; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:40
 ; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 offset:38
+; SI-NEXT:    s_load_dword s0, s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v3, s0
 ; SI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
@@ -5826,10 +5827,10 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
index fdcff3b19f758..77c14cfcc732a 100644
--- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope %s
 
 ; Although it's modeled without any control flow in order to get better code
 ; out of the structurizer, @llvm.amdgcn.kill actually ends the thread that calls
@@ -13,8 +13,8 @@
 define amdgpu_ps void @return_void(float %0) #0 {
 ; CHECK-LABEL: return_void:
 ; CHECK:       ; %bb.0: ; %main_body
-; CHECK-NEXT:    s_mov_b64 s[0:1], exec
 ; CHECK-NEXT:    s_mov_b32 s2, 0x41200000
+; CHECK-NEXT:    s_mov_b64 s[0:1], exec
 ; CHECK-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v0
 ; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; CHECK-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -26,7 +26,6 @@ define amdgpu_ps void @return_void(float %0) #0 {
 ; CHECK-NEXT:  ; %bb.2: ; %loop
 ; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    s_mov_b64 exec, 0
-; CHECK-NEXT:    s_mov_b64 vcc, 0
 ; CHECK-NEXT:    s_branch .LBB0_1
 ; CHECK-NEXT:  .LBB0_3: ; %Flow1
 ; CHECK-NEXT:    s_andn2_saveexec_b64 s[0:1], s[2:3]
@@ -57,8 +56,8 @@ end:
 define amdgpu_ps void @return_void_compr(float %0) #0 {
 ; CHECK-LABEL: return_void_compr:
 ; CHECK:       ; %bb.0: ; %main_body
-; CHECK-NEXT:    s_mov_b64 s[0:1], exec
 ; CHECK-NEXT:    s_mov_b32 s2, 0x41200000
+; CHECK-NEXT:    s_mov_b64 s[0:1], exec
 ; CHECK-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v0
 ; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; CHECK-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -70,7 +69,6 @@ define amdgpu_ps void @return_void_compr(float %0) #0 {
 ; CHECK-NEXT:  ; %bb.2: ; %loop
 ; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    s_mov_b64 exec, 0
-; CHECK-NEXT:    s_mov_b64 vcc, 0
 ; CHECK-NEXT:    s_branch .LBB1_1
 ; CHECK-NEXT:  .LBB1_3: ; %Flow1
 ; CHECK-NEXT:    s_andn2_saveexec_b64 s[0:1], s[2:3]
@@ -109,7 +107,6 @@ define amdgpu_ps void @only_kill() #0 {
 ; CHECK-NEXT:  ; %bb.2: ; %loop
 ; CHECK-NEXT:    ; in Loop: Header=BB2_1 Depth=1
 ; CHECK-NEXT:    s_mov_b64 exec, 0
-; CHECK-NEXT:    s_mov_b64 vcc, exec
 ; CHECK-NEXT:    s_cbranch_execnz .LBB2_1
 ; CHECK-NEXT:  ; %bb.3: ; %DummyReturnBlock
 ; CHECK-NEXT:    s_endpgm
@@ -129,8 +126,8 @@ loop:
 define amdgpu_ps float @return_nonvoid(float %0) #0 {
 ; CHECK-LABEL: return_nonvoid:
 ; CHECK:       ; %bb.0: ; %main_body
-; CHECK-NEXT:    s_mov_b64 s[0:1], exec
 ; CHECK-NEXT:    s_mov_b32 s2, 0x41200000
+; CHECK-NEXT:    s_mov_b64 s[0:1], exec
 ; CHECK-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v0
 ; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; CHECK-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -142,7 +139,6 @@ define amdgpu_ps float @return_nonvoid(float %0) #0 {
 ; CHECK-NEXT:  ; %bb.2: ; %loop
 ; CHECK-NEXT:    ; in Loop: Header=BB3_1 Depth=1
 ; CHECK-NEXT:    s_mov_b64 exec, 0
-; CHECK-NEXT:    s_mov_b64 vcc, exec
 ; CHECK-NEXT:    s_cbranch_execnz .LBB3_1
 ; CHECK-NEXT:  .LBB3_3: ; %Flow1
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/livevars-implicitdef.mir b/llvm/test/CodeGen/AMDGPU/livevars-implicitdef.mir
index 18aeb2527b1a3..ed78b6da798a1 100644
--- a/llvm/test/CodeGen/AMDGPU/livevars-implicitdef.mir
+++ b/llvm/test/CodeGen/AMDGPU/livevars-implicitdef.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn --run-pass=livevars -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 --run-pass=livevars -o - %s | FileCheck %s
 ---
 # Check that super register is defined for an sgpr copy.
 name:            sgpr_copy
@@ -12,7 +12,7 @@ body:             |
     ; CHECK-NEXT: $sgpr0 = COPY %sval
     ; CHECK-NEXT: $sgpr1 = COPY %sval
     ; CHECK-NEXT: $sgpr2 = COPY %sval
-    ; CHECK-NEXT: $sgpr3 = COPY killed %sval
+    ; CHECK-NEXT: $sgpr3 = COPY killed %sval, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
     ; CHECK-NEXT: SI_RETURN implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
     %sval:sreg_32 = S_MOV_B32 0
 
@@ -35,7 +35,7 @@ body:             |
     ; CHECK-NEXT: $vgpr0 = COPY %vval
     ; CHECK-NEXT: $vgpr1 = COPY %vval
     ; CHECK-NEXT: $vgpr2 = COPY %vval
-    ; CHECK-NEXT: $vgpr3 = COPY killed %vval
+    ; CHECK-NEXT: $vgpr3 = COPY killed %vval, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
     ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3
     %vval:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
 
@@ -56,7 +56,7 @@ body:             |
     ; CHECK: %sval:sreg_32 = S_MOV_B32 0
     ; CHECK-NEXT: $sgpr0 = COPY %sval
     ; CHECK-NEXT: $sgpr2 = COPY %sval
-    ; CHECK-NEXT: $sgpr3 = COPY killed %sval
+    ; CHECK-NEXT: $sgpr3 = COPY killed %sval, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
     ; CHECK-NEXT: SI_RETURN implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
     %sval:sreg_32 = S_MOV_B32 0
 
@@ -76,8 +76,8 @@ body:             |
     ; CHECK: %vval:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; CHECK-NEXT: $vgpr0 = COPY %vval
     ; CHECK-NEXT: $vgpr1 = COPY %vval
-    ; CHECK-NEXT: $vgpr2 = COPY %vval
-    ; CHECK-NEXT: $vgpr3 = COPY killed %vval
+    ; CHECK-NEXT: $vgpr2 = COPY %vval, implicit-def $vgpr1_vgpr2
+    ; CHECK-NEXT: $vgpr3 = COPY killed %vval, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
     ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1_vgpr2
     ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3
     %vval:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll
index 3dd9252d8e96a..96a04ff783ab2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
@@ -79,23 +79,24 @@ define amdgpu_kernel void @v_alignbyte_b32_2(ptr addrspace(1) %out, ptr addrspac
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GCN-NEXT:    s_load_dword s16, s[4:5], 0xf
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s14, 0
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_mov_b32 s15, s7
-; GCN-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_load_dword s2, s[4:5], 0xf
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_alignbyte_b32 v0, v2, v0, s16
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_alignbyte_b32 v0, v2, v0, s2
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
index 64fadb8d05eac..e3c29d3b0d943 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN:  llc -global-isel=0 -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-SDAG %s
-; RUN:  llc -global-isel=1 -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-GISEL %s
+; RUN:  llc -global-isel=0 -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-SDAG %s
+; RUN:  llc -global-isel=1 -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-GISEL %s
 
 declare i1 @llvm.amdgcn.class.f32(float, i32) #1
 declare i1 @llvm.amdgcn.class.f64(double, i32) #1
@@ -33,9 +33,9 @@ define amdgpu_kernel void @test_class_f32(ptr addrspace(1) %out, [8 x i32], floa
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, s6, v0
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], vcc, vcc
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
@@ -69,9 +69,9 @@ define amdgpu_kernel void @test_class_fabs_f32(ptr addrspace(1) %out, [8 x i32],
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], |s6|, v0
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[4:5]
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %a.fabs = call float @llvm.fabs.f32(float %a) #1
@@ -106,9 +106,9 @@ define amdgpu_kernel void @test_class_fneg_f32(ptr addrspace(1) %out, [8 x i32],
 ; SI-GISEL-NEXT:    v_mul_f32_e64 v0, -1.0, s3
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, s6
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[4:5]
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %a.fneg = fsub float -0.0, %a
@@ -143,9 +143,9 @@ define amdgpu_kernel void @test_class_fneg_fabs_f32(ptr addrspace(1) %out, [8 x
 ; SI-GISEL-NEXT:    v_mul_f32_e64 v0, -1.0, |s3|
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, s6
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[4:5]
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %a.fabs = call float @llvm.fabs.f32(float %a) #1
@@ -177,9 +177,9 @@ define amdgpu_kernel void @test_class_1_f32(ptr addrspace(1) %out, float %a) #0
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], s3, 1
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[4:5]
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
@@ -209,9 +209,9 @@ define amdgpu_kernel void @test_class_64_f32(ptr addrspace(1) %out, float %a) #0
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], s3, 64
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[4:5]
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
@@ -224,13 +224,13 @@ define amdgpu_kernel void @test_class_64_f32(ptr addrspace(1) %out, float %a) #0
 define amdgpu_kernel void @test_class_full_mask_f32(ptr addrspace(1) %out, float %a) #0 {
 ; SI-SDAG-LABEL: test_class_full_mask_f32:
 ; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-SDAG-NEXT:    s_load_dword s4, s[4:5], 0xb
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3ff
 ; SI-SDAG-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-SDAG-NEXT:    s_mov_b32 s2, -1
-; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3ff
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
+; SI-SDAG-NEXT:    v_cmp_class_f32_e32 vcc, s6, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; SI-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-SDAG-NEXT:    s_endpgm
@@ -244,9 +244,9 @@ define amdgpu_kernel void @test_class_full_mask_f32(ptr addrspace(1) %out, float
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, s3, v0
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], vcc, vcc
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1
@@ -258,13 +258,13 @@ define amdgpu_kernel void @test_class_full_mask_f32(ptr addrspace(1) %out, float
 define amdgpu_kernel void @test_class_9bit_mask_f32(ptr addrspace(1) %out, float %a) #0 {
 ; SI-SDAG-LABEL: test_class_9bit_mask_f32:
 ; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-SDAG-NEXT:    s_load_dword s4, s[4:5], 0xb
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x1ff
 ; SI-SDAG-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-SDAG-NEXT:    s_mov_b32 s2, -1
-; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x1ff
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
+; SI-SDAG-NEXT:    v_cmp_class_f32_e32 vcc, s6, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; SI-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-SDAG-NEXT:    s_endpgm
@@ -278,9 +278,9 @@ define amdgpu_kernel void @test_class_9bit_mask_f32(ptr addrspace(1) %out, float
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, s3, v0
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], vcc, vcc
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
@@ -319,10 +319,10 @@ define amdgpu_kernel void @v_test_class_full_mask_f32(ptr addrspace(1) %out, ptr
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x1ff
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v2, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -364,10 +364,10 @@ define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(ptr a
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, 1.0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -412,10 +412,10 @@ define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(ptr addrspac
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x44800000
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v3, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -446,17 +446,17 @@ define amdgpu_kernel void @test_class_f64(ptr addrspace(1) %out, [8 x i32], doub
 ;
 ; SI-GISEL-LABEL: test_class_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_load_dword s3, s[4:5], 0x1d
-; SI-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
+; SI-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x1d
+; SI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x13
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, s[6:7], v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, s[2:3], v0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], vcc, vcc
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
@@ -482,17 +482,17 @@ define amdgpu_kernel void @test_class_fabs_f64(ptr addrspace(1) %out, [8 x i32],
 ;
 ; SI-GISEL-LABEL: test_class_fabs_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_load_dword s3, s[4:5], 0x1d
-; SI-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
+; SI-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x1d
+; SI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x13
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
-; SI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], |s[6:7]|, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; SI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], |s[2:3]|, v0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[4:5]
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %a.fabs = call double @llvm.fabs.f64(double %a) #1
@@ -519,17 +519,18 @@ define amdgpu_kernel void @test_class_fneg_f64(ptr addrspace(1) %out, [8 x i32],
 ;
 ; SI-GISEL-LABEL: test_class_fneg_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
-; SI-GISEL-NEXT:    s_load_dword s3, s[4:5], 0x1d
+; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x13
+; SI-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x1d
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_max_f64 v[0:1], -s[0:1], -s[0:1]
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], s2
 ; SI-GISEL-NEXT:    s_mov_b32 s2, -1
-; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_max_f64 v[0:1], -s[6:7], -s[6:7]
-; SI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], s3
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[4:5]
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %a.fneg = fsub double -0.0, %a
@@ -556,17 +557,18 @@ define amdgpu_kernel void @test_class_fneg_fabs_f64(ptr addrspace(1) %out, [8 x
 ;
 ; SI-GISEL-LABEL: test_class_fneg_fabs_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
-; SI-GISEL-NEXT:    s_load_dword s3, s[4:5], 0x1d
+; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x13
+; SI-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x1d
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_max_f64 v[0:1], -|s[0:1]|, -|s[0:1]|
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], s2
 ; SI-GISEL-NEXT:    s_mov_b32 s2, -1
-; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_max_f64 v[0:1], -|s[6:7]|, -|s[6:7]|
-; SI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], s3
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[4:5]
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %a.fabs = call double @llvm.fabs.f64(double %a) #1
@@ -598,9 +600,9 @@ define amdgpu_kernel void @test_class_1_f64(ptr addrspace(1) %out, double %a) #0
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], s[2:3], 1
 ; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[4:5]
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1
@@ -630,9 +632,9 @@ define amdgpu_kernel void @test_class_64_f64(ptr addrspace(1) %out, double %a) #
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], s[2:3], 64
 ; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[4:5]
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1
@@ -645,29 +647,29 @@ define amdgpu_kernel void @test_class_64_f64(ptr addrspace(1) %out, double %a) #
 define amdgpu_kernel void @test_class_full_mask_f64(ptr addrspace(1) %out, [8 x i32], double %a) #0 {
 ; SI-SDAG-LABEL: test_class_full_mask_f64:
 ; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
 ; SI-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x13
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x1ff
 ; SI-SDAG-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-SDAG-NEXT:    s_mov_b32 s2, -1
-; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x1ff
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v0
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, s[6:7], v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; SI-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
 ; SI-GISEL-LABEL: test_class_full_mask_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
+; SI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x13
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1ff
-; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, s[6:7], v0
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, s[2:3], v0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], vcc, vcc
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
@@ -682,36 +684,36 @@ define amdgpu_kernel void @v_test_class_full_mask_f64(ptr addrspace(1) %out, ptr
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-SDAG-NEXT:    s_mov_b32 s6, -1
-; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, s2
 ; SI-SDAG-NEXT:    s_mov_b32 s5, s3
-; SI-SDAG-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    buffer_load_dwordx2 v[1:2], off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_movk_i32 s4, 0x1ff
 ; SI-SDAG-NEXT:    s_mov_b32 s2, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s3, s7
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], s4
-; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
-; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[1:2], s4
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
+; SI-SDAG-NEXT:    buffer_store_dword v0, v[3:4], s[0:3], 0 addr64
 ; SI-SDAG-NEXT:    s_endpgm
 ;
 ; SI-GISEL-LABEL: v_test_class_full_mask_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x1ff
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x1ff
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0
+; SI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v2
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, s[2:3], v2
+; SI-GISEL-NEXT:    s_mov_b32 s2, 0
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], vcc, vcc
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -753,10 +755,10 @@ define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(ptr a
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, 1.0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -802,10 +804,10 @@ define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(ptr addrspac
 ; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40900000
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v4
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -825,15 +827,15 @@ define amdgpu_kernel void @test_fold_or_class_f32_0(ptr addrspace(1) %out, ptr a
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-SDAG-NEXT:    s_mov_b32 s10, 0
-; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s11, s7
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s4, s0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, s1
+; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_cmp_class_f32_e64 s[0:1], v0, 3
 ; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
@@ -879,15 +881,15 @@ define amdgpu_kernel void @test_fold_or3_class_f32_0(ptr addrspace(1) %out, ptr
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-SDAG-NEXT:    s_mov_b32 s10, 0
-; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s11, s7
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s4, s0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, s1
+; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_cmp_class_f32_e64 s[0:1], v0, 7
 ; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
@@ -937,16 +939,16 @@ define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(ptr addrspace(1) %
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-SDAG-NEXT:    s_mov_b32 s10, 0
-; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s11, s7
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    s_movk_i32 s2, 0x3ff
 ; SI-SDAG-NEXT:    s_mov_b32 s4, s0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, s1
+; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_cmp_class_f32_e64 s[0:1], v0, s2
 ; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
@@ -960,31 +962,31 @@ define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(ptr addrspace(1) %
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-GISEL-NEXT:    s_mov_b32 s10, 0
 ; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x80
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; SI-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x100
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x80
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x200
 ; SI-GISEL-NEXT:    s_mov_b32 s10, -1
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[6:7], v0, 1
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[8:9], v0, 2
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[12:13], v0, 4
-; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[14:15], v0, 8
-; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[16:17], v0, 16
-; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[18:19], v0, 32
-; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[20:21], v0, 64
-; SI-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[0:1], v0, v1
-; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v3
 ; SI-GISEL-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[14:15], v0, 8
 ; SI-GISEL-NEXT:    s_or_b64 s[6:7], s[6:7], s[12:13]
+; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[16:17], v0, 16
 ; SI-GISEL-NEXT:    s_or_b64 s[6:7], s[6:7], s[14:15]
+; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[18:19], v0, 32
 ; SI-GISEL-NEXT:    s_or_b64 s[6:7], s[6:7], s[16:17]
+; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[20:21], v0, 64
 ; SI-GISEL-NEXT:    s_or_b64 s[6:7], s[6:7], s[18:19]
+; SI-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; SI-GISEL-NEXT:    s_or_b64 s[6:7], s[6:7], s[20:21]
+; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[0:1], v0, v2
 ; SI-GISEL-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
+; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v3
 ; SI-GISEL-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
 ; SI-GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
@@ -1026,15 +1028,15 @@ define amdgpu_kernel void @test_fold_or_class_f32_1(ptr addrspace(1) %out, ptr a
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-SDAG-NEXT:    s_mov_b32 s10, 0
-; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s11, s7
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s4, s0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, s1
+; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_cmp_class_f32_e64 s[0:1], v0, 12
 ; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
@@ -1080,15 +1082,15 @@ define amdgpu_kernel void @test_fold_or_class_f32_2(ptr addrspace(1) %out, ptr a
 ; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-SDAG-NEXT:    s_mov_b32 s10, 0
-; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s11, s7
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s4, s0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, s1
+; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_cmp_class_f32_e64 s[0:1], v0, 7
 ; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
@@ -1133,16 +1135,16 @@ define amdgpu_kernel void @test_no_fold_or_class_f32_0(ptr addrspace(1) %out, pt
 ; SI-SDAG-NEXT:    s_load_dword s12, s[4:5], 0xd
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-SDAG-NEXT:    s_mov_b32 s10, 0
-; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s11, s7
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s4, s0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, s1
 ; SI-SDAG-NEXT:    v_cmp_class_f32_e64 s[0:1], s12, 8
+; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, 4
 ; SI-SDAG-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
@@ -1162,9 +1164,9 @@ define amdgpu_kernel void @test_no_fold_or_class_f32_0(ptr addrspace(1) %out, pt
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[2:3], s8, 8
-; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_or_b64 s[2:3], s[2:3], s[2:3]
 ; SI-GISEL-NEXT:    s_cselect_b64 s[2:3], exec, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 4
 ; SI-GISEL-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
@@ -1205,9 +1207,9 @@ define amdgpu_kernel void @test_class_0_f32(ptr addrspace(1) %out, float %a) #0
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], s3, 0
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[4:5]
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1
@@ -1234,9 +1236,9 @@ define amdgpu_kernel void @test_class_0_f64(ptr addrspace(1) %out, double %a) #0
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], s[2:3], 0
 ; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[4:5]
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1
@@ -1265,9 +1267,9 @@ define amdgpu_kernel void @test_class_undef_f32(ptr addrspace(1) %out, float %a,
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, s0, v0
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], vcc, vcc
-; SI-GISEL-NEXT:    s_cselect_b32 s4, -1, 0
+; SI-GISEL-NEXT:    s_cselect_b32 s3, -1, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
   %result = call i1 @llvm.amdgcn.class.f32(float poison, i32 %b) #1
@@ -1329,10 +1331,11 @@ define i1 @test_fold_and_ord_multi_use(float %a) {
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; SI-SDAG-NEXT:    buffer_store_byte v1, off, s[4:7], 0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; SI-SDAG-NEXT:    buffer_store_byte v1, off, s[4:7], 0
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-SDAG-NEXT:    s_waitcnt expcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: test_fold_and_ord_multi_use:
@@ -1343,10 +1346,11 @@ define i1 @test_fold_and_ord_multi_use(float %a) {
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; SI-GISEL-NEXT:    buffer_store_byte v1, off, s[4:7], 0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; SI-GISEL-NEXT:    buffer_store_byte v1, off, s[4:7], 0
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-GISEL-NEXT:    s_waitcnt expcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
   store volatile i1 %class, ptr addrspace(1) poison
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 18afb7c677207..272b0a2a84938 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
 ; XUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
 ; FIXME: Enable for VI.
@@ -13,17 +13,17 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], f
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x13
-; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2e
-; GCN-NEXT:    s_load_dword s7, s[4:5], 0x1c
+; GCN-NEXT:    s_load_dword s7, s[4:5], 0x2e
+; GCN-NEXT:    s_load_dword s8, s[4:5], 0x1c
 ; GCN-NEXT:    s_load_dword s4, s[4:5], 0x25
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bitcmp1_b32 s2, 0
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_bitcmp1_b32 s7, 0
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, s8
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -36,15 +36,15 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out,
 ; GCN-LABEL: test_div_fmas_f32_inline_imm_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2e
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x1c
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2e
+; GCN-NEXT:    s_load_dword s7, s[4:5], 0x1c
 ; GCN-NEXT:    s_load_dword s4, s[4:5], 0x25
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bitcmp1_b32 s2, 0
+; GCN-NEXT:    s_bitcmp1_b32 s6, 0
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NEXT:    v_div_fmas_f32 v0, 1.0, v0, v1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -58,15 +58,15 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out,
 ; GCN-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_load_dword s2, s[4:5], 0x16
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0xb
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x16
+; GCN-NEXT:    s_load_dword s7, s[4:5], 0xb
 ; GCN-NEXT:    s_load_dword s4, s[4:5], 0xd
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bitcmp1_b32 s2, 0
+; GCN-NEXT:    s_bitcmp1_b32 s6, 0
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NEXT:    v_div_fmas_f32 v0, v0, 1.0, v1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -80,15 +80,15 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out,
 ; GCN-LABEL: test_div_fmas_f32_inline_imm_2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2e
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x13
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2e
+; GCN-NEXT:    s_load_dword s7, s[4:5], 0x13
 ; GCN-NEXT:    s_load_dword s4, s[4:5], 0x1c
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bitcmp1_b32 s2, 0
+; GCN-NEXT:    s_bitcmp1_b32 s6, 0
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NEXT:    v_div_fmas_f32 v0, v0, v1, 1.0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -101,23 +101,23 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) nounwind {
 ; GCN-LABEL: test_div_fmas_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s8, s[4:5], 0x11
-; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_load_dword s0, s[4:5], 0x11
+; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bitcmp1_b32 s8, 0
+; GCN-NEXT:    s_bitcmp1_b32 s0, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-NEXT:    v_mov_b32_e32 v2, s12
+; GCN-NEXT:    v_mov_b32_e32 v3, s13
+; GCN-NEXT:    v_mov_b32_e32 v4, s14
+; GCN-NEXT:    v_mov_b32_e32 v5, s15
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_mov_b32 s10, -1
-; GCN-NEXT:    s_mov_b32 s8, s0
-; GCN-NEXT:    s_mov_b32 s9, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
-; GCN-NEXT:    v_mov_b32_e32 v4, s6
-; GCN-NEXT:    v_mov_b32_e32 v5, s7
 ; GCN-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NEXT:    s_mov_b32 s0, s8
+; GCN-NEXT:    s_mov_b32 s1, s9
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
   store double %result, ptr addrspace(1) %out, align 8
@@ -130,11 +130,11 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out,
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s3, 0
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
@@ -151,13 +151,13 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b64 vcc, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    s_mov_b64 vcc, 0
 ; GCN-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -171,13 +171,13 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b64 vcc, -1
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    s_mov_b64 vcc, -1
 ; GCN-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -194,22 +194,22 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; GCN-NEXT:    buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 glc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 offset:4 glc
+; GCN-NEXT:    buffer_load_dword v4, v[1:2], s[4:7], 0 addr64 offset:4 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 offset:8 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN-NEXT:    s_and_b64 vcc, vcc, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT:    s_and_b64 vcc, vcc, s[4:5]
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, s7
-; GCN-NEXT:    v_div_fmas_f32 v0, v0, v3, v1
+; GCN-NEXT:    v_div_fmas_f32 v0, v3, v4, v1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
 ; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -235,7 +235,6 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p
 ; GCN-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
@@ -244,6 +243,7 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p
 ; GCN-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; GCN-NEXT:    buffer_load_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64
 ; GCN-NEXT:    buffer_load_dword v3, v[3:4], s[0:3], 0 addr64 offset:8
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
 ; GCN-NEXT:    s_mov_b64 vcc, 0
 ; GCN-NEXT:    s_and_saveexec_b64 s[10:11], s[0:1]
@@ -251,6 +251,7 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p
 ; GCN-NEXT:  ; %bb.1: ; %bb
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_mov_b32 s7, s3
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 3dc6c5567efa0..c20a44822aee4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GCN %s
 
 declare i32 @llvm.amdgcn.lds.kernel.id()
 declare i32 @llvm.amdgcn.workgroup.id.x()
@@ -23,15 +23,15 @@ define void @function_lds_id(ptr addrspace(1) %out) {
 define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
 ; GCN-LABEL: kernel_lds_id:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT:    s_add_i32 s12, s12, s17
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_add_i32 s2, s14, 42
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
   %tmp0 = call i32 @llvm.amdgcn.lds.kernel.id()
@@ -44,30 +44,30 @@ define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds
 define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !1 {
 ; GCN-LABEL: indirect_lds_id:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GCN-NEXT:    s_add_u32 s0, s0, s17
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_mov_b32 s13, s15
-; GCN-NEXT:    s_mov_b32 s12, s14
 ; GCN-NEXT:    s_load_dwordx2 s[18:19], s[8:9], 0x0
 ; GCN-NEXT:    s_add_u32 s8, s8, 8
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GCN-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT:    s_mov_b32 s13, s15
+; GCN-NEXT:    s_mov_b32 s12, s14
 ; GCN-NEXT:    s_getpc_b64 s[14:15]
 ; GCN-NEXT:    s_add_u32 s14, s14, function_lds_id@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s15, s15, function_lds_id@gotpcrel32@hi+12
 ; GCN-NEXT:    s_load_dwordx2 s[20:21], s[14:15], 0x0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GCN-NEXT:    v_or_b32_e32 v31, v0, v2
-; GCN-NEXT:    s_mov_b32 s15, 21
 ; GCN-NEXT:    s_mov_b32 s14, s16
+; GCN-NEXT:    s_mov_b32 s15, 21
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s18
 ; GCN-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[20:21]
 ; GCN-NEXT:    s_endpgm
   call void @function_lds_id(ptr addrspace(1) %out)
@@ -77,10 +77,10 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
 define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
 ; GCN-LABEL: doesnt_use_it:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x64
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
index 69e7ac4317e30..58d8ea52527b6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefixes=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -global-isel=1 < %s | FileCheck -check-prefixes=SI-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -global-isel=1 < %s | FileCheck -check-prefixes=SI-GISEL %s
 ; RUN: llc -mtriple=amdgcn -global-isel=1 -mcpu=tonga < %s | FileCheck -check-prefixes=VI-GISEL %s
 
 declare float @llvm.amdgcn.rsq.clamp.f32(float) #1
@@ -10,12 +10,12 @@ declare double @llvm.amdgcn.rsq.clamp.f64(double) #1
 define amdgpu_kernel void @rsq_clamp_f32(ptr addrspace(1) %out, float %src) #0 {
 ; SI-LABEL: rsq_clamp_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_rsq_clamp_f32_e32 v0, s6
+; SI-NEXT:    v_rsq_clamp_f32_e32 v0, s2
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -35,12 +35,12 @@ define amdgpu_kernel void @rsq_clamp_f32(ptr addrspace(1) %out, float %src) #0 {
 ;
 ; SI-GISEL-LABEL: rsq_clamp_f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_load_dword s3, s[4:5], 0xb
+; SI-GISEL-NEXT:    s_load_dword s2, s[4:5], 0xb
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s2, -1
-; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_rsq_clamp_f32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_rsq_clamp_f32_e32 v0, s2
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -70,9 +70,9 @@ define amdgpu_kernel void @rsq_clamp_f64(ptr addrspace(1) %out, double %src) #0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_rsq_clamp_f64_e32 v[0:1], s[2:3]
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_rsq_clamp_f64_e32 v[0:1], s[2:3]
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -136,10 +136,10 @@ define amdgpu_kernel void @rsq_clamp_undef_f32(ptr addrspace(1) %out) #0 {
 ; SI-GISEL-LABEL: rsq_clamp_undef_f32:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_rsq_clamp_f32_e32 v0, s0
 ; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_rsq_clamp_f32_e32 v0, s0
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
index 3f4d530838326..2d66657e3c4af 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefixes=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -global-isel=1 < %s | FileCheck -check-prefixes=SI-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -global-isel=1 < %s | FileCheck -check-prefixes=SI-GISEL %s
 ; RUN: llc -mtriple=amdgcn -global-isel=1 -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-GISEL %s
 ; RUN: llc -mtriple=amdgcn -global-isel=1 -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s
 
@@ -11,12 +11,12 @@ declare double @llvm.amdgcn.rsq.f64(double) #0
 define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %out, float %src) #1 {
 ; SI-LABEL: rsq_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_rsq_f32_e32 v0, s6
+; SI-NEXT:    v_rsq_f32_e32 v0, s2
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -33,12 +33,12 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %out, float %src) #1 {
 ;
 ; SI-GISEL-LABEL: rsq_f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_load_dword s3, s[4:5], 0xb
+; SI-GISEL-NEXT:    s_load_dword s2, s[4:5], 0xb
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s2, -1
-; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_rsq_f32_e32 v0, s3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_rsq_f32_e32 v0, s2
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -74,9 +74,9 @@ define amdgpu_kernel void @rsq_f32_constant_4.0(ptr addrspace(1) %out) #1 {
 ; SI-LABEL: rsq_f32_constant_4.0:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    v_rsq_f32_e32 v0, 4.0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_rsq_f32_e32 v0, 4.0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -129,9 +129,9 @@ define amdgpu_kernel void @rsq_f32_constant_100.0(ptr addrspace(1) %out) #1 {
 ; SI-LABEL: rsq_f32_constant_100.0:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    v_rsq_f32_e32 v0, 0x42c80000
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_rsq_f32_e32 v0, 0x42c80000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -187,9 +187,9 @@ define amdgpu_kernel void @rsq_f64(ptr addrspace(1) %out, double %src) #1 {
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_rsq_f64_e32 v[0:1], s[2:3]
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_rsq_f64_e32 v[0:1], s[2:3]
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -241,9 +241,9 @@ define amdgpu_kernel void @rsq_f64_constant_4.0(ptr addrspace(1) %out) #1 {
 ; SI-LABEL: rsq_f64_constant_4.0:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    v_rsq_f64_e32 v[0:1], 4.0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_rsq_f64_e32 v[0:1], 4.0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -295,9 +295,9 @@ define amdgpu_kernel void @rsq_f64_constant_100.0(ptr addrspace(1) %out) #1 {
 ; SI-LABEL: rsq_f64_constant_100.0:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    v_rsq_f64_e32 v[0:1], 0x40590000
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_rsq_f64_e32 v[0:1], 0x40590000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -360,10 +360,10 @@ define amdgpu_kernel void @rsq_undef_f32(ptr addrspace(1) %out) #1 {
 ; SI-GISEL-LABEL: rsq_undef_f32:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_rsq_f32_e32 v0, s0
 ; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_rsq_f32_e32 v0, s0
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 28926c63de73c..03b4fef44949a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=VARIANT0 %s
-; RUN: llc -mtriple=amdgcn -mattr=+auto-waitcnt-before-barrier < %s | FileCheck --check-prefix=VARIANT1 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefix=VARIANT0 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -mattr=+auto-waitcnt-before-barrier < %s | FileCheck --check-prefix=VARIANT1 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=VARIANT2 %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=VARIANT2-GISEL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+auto-waitcnt-before-barrier < %s | FileCheck --check-prefix=VARIANT3 %s
@@ -17,14 +17,15 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT0-NEXT:    s_mov_b32 s2, 0
 ; VARIANT0-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; VARIANT0-NEXT:    v_mov_b32_e32 v2, 0
-; VARIANT0-NEXT:    v_not_b32_e32 v3, v0
 ; VARIANT0-NEXT:    s_waitcnt lgkmcnt(0)
 ; VARIANT0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
-; VARIANT0-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; VARIANT0-NEXT:    s_barrier
-; VARIANT0-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
+; VARIANT0-NEXT:    s_waitcnt expcnt(0)
+; VARIANT0-NEXT:    v_not_b32_e32 v0, v0
+; VARIANT0-NEXT:    v_add_i32_e32 v3, vcc, s4, v0
 ; VARIANT0-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; VARIANT0-NEXT:    v_lshl_b64 v[3:4], v[3:4], 2
+; VARIANT0-NEXT:    s_waitcnt vmcnt(0)
+; VARIANT0-NEXT:    s_barrier
 ; VARIANT0-NEXT:    buffer_load_dword v0, v[3:4], s[0:3], 0 addr64
 ; VARIANT0-NEXT:    s_waitcnt vmcnt(0)
 ; VARIANT0-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
@@ -38,14 +39,14 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT1-NEXT:    s_mov_b32 s2, 0
 ; VARIANT1-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; VARIANT1-NEXT:    v_mov_b32_e32 v2, 0
-; VARIANT1-NEXT:    v_not_b32_e32 v3, v0
 ; VARIANT1-NEXT:    s_waitcnt lgkmcnt(0)
 ; VARIANT1-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
-; VARIANT1-NEXT:    s_barrier
-; VARIANT1-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
+; VARIANT1-NEXT:    s_waitcnt expcnt(0)
+; VARIANT1-NEXT:    v_not_b32_e32 v0, v0
+; VARIANT1-NEXT:    v_add_i32_e32 v3, vcc, s4, v0
 ; VARIANT1-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; VARIANT1-NEXT:    v_lshl_b64 v[3:4], v[3:4], 2
-; VARIANT1-NEXT:    s_waitcnt expcnt(0)
+; VARIANT1-NEXT:    s_barrier
 ; VARIANT1-NEXT:    buffer_load_dword v0, v[3:4], s[0:3], 0 addr64
 ; VARIANT1-NEXT:    s_waitcnt vmcnt(0)
 ; VARIANT1-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll
index 8282ff3ed2fcc..9652f65a4b1f4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -show-mc-encoding < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -global-isel -mtriple=amdgcn -show-mc-encoding < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -show-mc-encoding < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx600 -show-mc-encoding < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 %s
 
 declare void @llvm.amdgcn.s.setprio(i16) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
index 12b45f06c9fd7..4a2737535defe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GCN %s
 
 define amdgpu_kernel void @test_sched_barrier() #0 {
 ; GCN-LABEL: test_sched_barrier:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
index e421e2c8ebfc4..bcb763d5a3d04 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck --check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefix=SI %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefix=VI %s
 
 define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
@@ -9,9 +9,9 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0,
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_bfe_u32 v0, v0, s3, s3
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -37,14 +37,14 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0,
 ; SI-LABEL: bfe_u32_arg_arg_imm:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    v_bfe_u32 v0, s2, v1, v0
+; SI-NEXT:    v_bfe_u32 v0, s2, v0, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -70,13 +70,13 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0,
 ; SI-LABEL: bfe_u32_arg_imm_arg:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    v_bfe_u32 v0, s2, v0, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -103,14 +103,14 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1,
 ; SI-LABEL: bfe_u32_imm_arg_arg:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_movk_i32 s8, 0x7b
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_movk_i32 s8, 0x7b
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    v_bfe_u32 v0, s8, v0, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -1529,20 +1529,23 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s2, s6
 ; SI-NEXT:    s_mov_b32 s3, s7
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; SI-NEXT:    s_mov_b32 s4, s8
-; SI-NEXT:    s_mov_b32 s5, s9
-; SI-NEXT:    s_mov_b32 s0, s10
-; SI-NEXT:    s_mov_b32 s1, s11
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s10, s6
+; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s8, s2
+; SI-NEXT:    s_mov_b32 s9, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, 63, v0
 ; SI-NEXT:    v_bfe_u32 v1, v0, 2, 2
 ; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: simplify_bfe_u32_multi_use_arg:
@@ -1581,12 +1584,12 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0
 define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 {
 ; SI-LABEL: lshr_and:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_u32 s4, s2, 0x30006
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bfe_u32 s4, s6, 0x30006
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -1614,9 +1617,9 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    s_lshr_b32 s4, s4, s5
+; SI-NEXT:    s_and_b32 s4, s4, 7
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_lshr_b32 s2, s4, s5
-; SI-NEXT:    s_and_b32 s4, s2, 7
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -1643,12 +1646,12 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0
 define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 {
 ; SI-LABEL: and_lshr:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_u32 s4, s2, 0x30006
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bfe_u32 s4, s6, 0x30006
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -1673,12 +1676,12 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 {
 define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 {
 ; SI-LABEL: and_lshr2:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_u32 s4, s2, 0x30006
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bfe_u32 s4, s6, 0x30006
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -1703,12 +1706,12 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 {
 define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 {
 ; SI-LABEL: shl_lshr:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_u32 s4, s2, 0x150002
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bfe_u32 s4, s6, 0x150002
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 4b08dcbf94458..71e8c9900611b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
@@ -9,22 +9,23 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; SI-LABEL: umulo_i64_v_v:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mul_hi_u32 v4, v1, v2
-; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
-; SI-NEXT:    v_mul_hi_u32 v6, v0, v3
-; SI-NEXT:    v_mul_lo_u32 v7, v0, v3
-; SI-NEXT:    v_mul_hi_u32 v8, v0, v2
-; SI-NEXT:    v_mul_hi_u32 v9, v1, v3
-; SI-NEXT:    v_mul_lo_u32 v3, v1, v3
+; SI-NEXT:    v_mul_lo_u32 v4, v0, v3
+; SI-NEXT:    v_mul_hi_u32 v6, v0, v2
+; SI-NEXT:    v_mul_hi_u32 v8, v0, v3
+; SI-NEXT:    v_mul_lo_u32 v7, v1, v2
+; SI-NEXT:    v_mul_hi_u32 v5, v1, v2
+; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
+; SI-NEXT:    v_mul_hi_u32 v8, v1, v3
+; SI-NEXT:    v_mul_lo_u32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; SI-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v8, vcc
+; SI-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
 ; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
-; SI-NEXT:    v_add_i32_e32 v1, vcc, v8, v7
-; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v6, vcc
-; SI-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; SI-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v9, vcc
-; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[1:2]
+; SI-NEXT:    v_mov_b32_e32 v1, v4
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -134,37 +135,36 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; SI-LABEL: smulo_i64_v_v:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mul_hi_u32 v6, v1, v2
-; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
-; SI-NEXT:    v_mul_hi_u32 v7, v0, v3
-; SI-NEXT:    v_mul_lo_u32 v8, v0, v3
-; SI-NEXT:    v_mul_hi_u32 v9, v0, v2
-; SI-NEXT:    v_mul_hi_i32 v10, v1, v3
-; SI-NEXT:    v_mul_lo_u32 v11, v1, v3
-; SI-NEXT:    v_mul_lo_u32 v4, v0, v2
-; SI-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; SI-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; SI-NEXT:    v_addc_u32_e32 v7, vcc, v7, v6, vcc
-; SI-NEXT:    v_addc_u32_e32 v8, vcc, 0, v10, vcc
-; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
-; SI-NEXT:    v_add_i32_e32 v9, vcc, v7, v11
-; SI-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
-; SI-NEXT:    v_mov_b32_e32 v7, v6
-; SI-NEXT:    v_sub_i32_e32 v2, vcc, v9, v2
-; SI-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v8, vcc
+; SI-NEXT:    v_mul_lo_u32 v4, v0, v3
+; SI-NEXT:    v_mul_hi_u32 v6, v0, v2
+; SI-NEXT:    v_mul_hi_u32 v8, v0, v3
+; SI-NEXT:    v_mul_lo_u32 v7, v1, v2
+; SI-NEXT:    v_mul_hi_u32 v5, v1, v2
+; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
+; SI-NEXT:    v_mul_hi_i32 v8, v1, v3
+; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; SI-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; SI-NEXT:    v_mul_lo_u32 v6, v1, v3
+; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
+; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v7, vcc
+; SI-NEXT:    v_sub_i32_e32 v7, vcc, v5, v2
+; SI-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v6, vcc
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
-; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v10, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; SI-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
-; SI-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v8, vcc
+; SI-NEXT:    v_sub_i32_e32 v7, vcc, v5, v0
+; SI-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v1, vcc
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
-; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7]
+; SI-NEXT:    v_cndmask_b32_e32 v6, v1, v6, vcc
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
+; SI-NEXT:    v_mov_b32_e32 v2, v1
+; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[5:6], v[1:2]
+; SI-NEXT:    v_mov_b32_e32 v1, v4
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, v4
-; SI-NEXT:    v_mov_b32_e32 v1, v5
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: smulo_i64_v_v:
@@ -333,34 +333,34 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
 ; SI-LABEL: umulo_i64_s:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
-; SI-NEXT:    s_mul_i32 s4, s1, s2
-; SI-NEXT:    v_mov_b32_e32 v2, s3
-; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    v_mul_hi_u32 v2, s0, v0
+; SI-NEXT:    v_mul_hi_u32 v3, s0, v1
 ; SI-NEXT:    s_mul_i32 s5, s0, s3
-; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
-; SI-NEXT:    v_mul_hi_u32 v2, s1, v2
-; SI-NEXT:    s_mul_i32 s3, s1, s3
-; SI-NEXT:    s_mul_i32 s2, s0, s2
-; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
+; SI-NEXT:    v_mul_hi_u32 v0, s1, v0
+; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v2
+; SI-NEXT:    v_mul_hi_u32 v1, s1, v1
+; SI-NEXT:    s_mul_i32 s4, s1, s2
 ; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; SI-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
-; SI-NEXT:    v_addc_u32_e64 v1, s[0:1], v3, v1, vcc
-; SI-NEXT:    v_add_i32_e32 v3, vcc, s4, v0
-; SI-NEXT:    v_addc_u32_e64 v2, vcc, 0, v2, s[0:1]
-; SI-NEXT:    v_add_i32_e32 v0, vcc, s3, v1
-; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; SI-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SI-NEXT:    s_mul_i32 s1, s1, s3
+; SI-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SI-NEXT:    v_add_i32_e32 v2, vcc, s5, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, s4, v2
 ; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
+; SI-NEXT:    s_mul_i32 s2, s0, s2
 ; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; SI-NEXT:    s_cselect_b32 s0, 0, s2
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: umulo_i64_s:
@@ -488,50 +488,50 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; SI-LABEL: smulo_i64_s:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
-; SI-NEXT:    s_mul_i32 s4, s1, s2
-; SI-NEXT:    v_mov_b32_e32 v2, s3
-; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
-; SI-NEXT:    s_mul_i32 s5, s0, s3
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    v_mul_hi_u32 v2, s1, v0
 ; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
-; SI-NEXT:    v_mul_hi_i32 v2, s1, v2
-; SI-NEXT:    s_mul_i32 s6, s1, s3
-; SI-NEXT:    s_mul_i32 s8, s0, s2
-; SI-NEXT:    v_readfirstlane_b32 s9, v1
-; SI-NEXT:    v_readfirstlane_b32 s10, v3
-; SI-NEXT:    v_readfirstlane_b32 s11, v0
-; SI-NEXT:    v_readfirstlane_b32 s12, v2
-; SI-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
-; SI-NEXT:    s_add_u32 s5, s11, s5
-; SI-NEXT:    v_add_i32_e32 v2, vcc, s4, v0
-; SI-NEXT:    s_addc_u32 s10, 0, s10
-; SI-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
-; SI-NEXT:    s_add_u32 s4, s5, s4
-; SI-NEXT:    v_mov_b32_e32 v1, v0
-; SI-NEXT:    s_addc_u32 s4, s10, s9
-; SI-NEXT:    s_addc_u32 s5, s12, 0
-; SI-NEXT:    s_add_u32 s4, s4, s6
+; SI-NEXT:    v_mul_hi_u32 v3, s0, v1
+; SI-NEXT:    s_mul_i32 s7, s0, s3
+; SI-NEXT:    v_mul_hi_i32 v1, s1, v1
+; SI-NEXT:    v_readfirstlane_b32 s8, v0
+; SI-NEXT:    v_readfirstlane_b32 s5, v3
+; SI-NEXT:    s_add_u32 s8, s8, s7
+; SI-NEXT:    s_mul_i32 s6, s1, s2
 ; SI-NEXT:    s_addc_u32 s5, 0, s5
-; SI-NEXT:    s_sub_u32 s2, s4, s2
-; SI-NEXT:    s_subb_u32 s6, s5, 0
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
+; SI-NEXT:    s_add_u32 s8, s8, s6
+; SI-NEXT:    s_addc_u32 s4, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s5, v1
+; SI-NEXT:    s_addc_u32 s5, s5, 0
+; SI-NEXT:    s_mul_i32 s8, s1, s3
+; SI-NEXT:    s_add_u32 s4, s4, s8
+; SI-NEXT:    s_addc_u32 s5, 0, s5
+; SI-NEXT:    s_sub_u32 s8, s4, s2
+; SI-NEXT:    s_subb_u32 s9, s5, 0
 ; SI-NEXT:    s_cmp_lt_i32 s1, 0
-; SI-NEXT:    s_cselect_b32 s1, s6, s5
-; SI-NEXT:    s_cselect_b32 s2, s2, s4
-; SI-NEXT:    s_sub_u32 s0, s2, s0
-; SI-NEXT:    s_subb_u32 s4, s1, 0
+; SI-NEXT:    s_cselect_b32 s4, s8, s4
+; SI-NEXT:    s_cselect_b32 s1, s9, s5
+; SI-NEXT:    s_sub_u32 s8, s4, s0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
+; SI-NEXT:    s_subb_u32 s5, s1, 0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v0
 ; SI-NEXT:    s_cmp_lt_i32 s3, 0
-; SI-NEXT:    s_cselect_b32 s1, s4, s1
-; SI-NEXT:    s_cselect_b32 s0, s0, s2
-; SI-NEXT:    v_cmp_ne_u64_e32 vcc, s[0:1], v[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
+; SI-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
+; SI-NEXT:    s_cselect_b32 s5, s5, s1
+; SI-NEXT:    s_cselect_b32 s4, s8, s4
+; SI-NEXT:    v_mov_b32_e32 v1, v0
+; SI-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
+; SI-NEXT:    s_mul_i32 s2, s0, s2
 ; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
-; SI-NEXT:    s_cselect_b32 s0, 0, s8
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_cselect_b32 s0, 0, s2
+; SI-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: smulo_i64_s:
@@ -711,13 +711,13 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
 ; SI-LABEL: smulo_i64_v_4:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_lshl_b64 v[5:6], v[0:1], 2
-; SI-NEXT:    v_alignbit_b32 v4, v1, v0, 30
-; SI-NEXT:    v_ashr_i64 v[2:3], v[5:6], 2
-; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
+; SI-NEXT:    v_lshl_b64 v[4:5], v[0:1], 2
+; SI-NEXT:    v_alignbit_b32 v3, v1, v0, 30
+; SI-NEXT:    v_ashr_i64 v[5:6], v[4:5], 2
+; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[5:6], v[0:1]
+; SI-NEXT:    v_mov_b32_e32 v0, v4
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, v5
-; SI-NEXT:    v_mov_b32_e32 v1, v4
+; SI-NEXT:    v_mov_b32_e32 v1, v3
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: smulo_i64_v_4:
@@ -786,8 +786,8 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
 ; SI-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
 ; SI-NEXT:    v_mov_b32_e32 v6, v0
 ; SI-NEXT:    v_lshl_b64 v[4:5], v[0:1], 2
-; SI-NEXT:    v_alignbit_b32 v3, v1, v0, 30
 ; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
+; SI-NEXT:    v_alignbit_b32 v3, v1, v0, 30
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, v4
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index 08953caee405c..18caa2297288f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefixes=SI,GCN,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefixes=SI,GCN,FUNC %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga  < %s | FileCheck --check-prefixes=VI,GCN,FUNC %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600,FUNC %s
 
@@ -126,9 +126,9 @@ define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) {
 ; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x6
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mul_i32 s4, s6, s7
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -166,13 +166,13 @@ entry:
 define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) {
 ; SI-LABEL: local_size_xz:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dword s2, s[4:5], 0x6
-; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
+; SI-NEXT:    s_load_dword s6, s[4:5], 0x6
+; SI-NEXT:    s_load_dword s7, s[4:5], 0x8
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mul_i32 s4, s2, s6
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mul_i32 s4, s6, s7
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -213,9 +213,9 @@ define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) {
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x7
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mul_i32 s0, s0, s1
-; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
@@ -255,13 +255,13 @@ define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) {
 ; SI-LABEL: local_size_xyz:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x6
-; SI-NEXT:    s_load_dword s2, s[4:5], 0x8
+; SI-NEXT:    s_load_dword s8, s[4:5], 0x8
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mul_i32 s4, s6, s7
-; SI-NEXT:    s_add_i32 s4, s4, s2
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_add_i32 s4, s4, s8
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
index 2623d8e3b970e..242744b1cb809 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
 
@@ -8,11 +8,11 @@ define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocaptur
 ; GFX6-LABEL: constant_load_v8f32:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s15, 0xf000
+; GFX6-NEXT:    s_mov_b32 s14, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_load_dword s16, s[10:11], 0x0
 ; GFX6-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX6-NEXT:    s_mov_b32 s15, 0xf000
-; GFX6-NEXT:    s_mov_b32 s14, -1
 ; GFX6-NEXT:    s_mov_b32 s12, s10
 ; GFX6-NEXT:    s_mov_b32 s13, s11
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index 001d7487b51b4..90a0e43fb27d0 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GFX6-NOHSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6-NOHSA %s
 ; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GFX7-HSA %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8-NOHSA %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
@@ -71,24 +71,24 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu
 ; GFX6-NOHSA:       ; %bb.0: ; %entry
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x9
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dwordx2 s[24:25], s[18:19], 0x0
+; GFX6-NOHSA-NEXT:    s_load_dwordx2 s[20:21], s[18:19], 0x0
 ; GFX6-NOHSA-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s23, 0xf000
-; GFX6-NOHSA-NEXT:    s_mov_b32 s22, -1
-; GFX6-NOHSA-NEXT:    s_mov_b32 s20, s18
-; GFX6-NOHSA-NEXT:    s_mov_b32 s21, s19
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s24
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s25
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s20
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s21
 ; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
+; GFX6-NOHSA-NEXT:    s_mov_b32 s0, s18
 ; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
+; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[4:5], v[0:1]
+; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX6-NOHSA-NEXT:    s_mov_b32 s1, s19
 ; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[8:9], v[0:1]
 ; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[10:11], v[0:1]
 ; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[12:13], v[0:1]
 ; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[14:15], v[0:1]
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[20:23], 0
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_load_2v4f64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 01bdaacbabac2..589ea74948d2a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
@@ -2431,16 +2431,16 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
 ; GFX6-NEXT:    s_bfe_u32 s31, s4, 0x10016
 ; GFX6-NEXT:    s_bfe_u32 s33, s4, 0x10014
 ; GFX6-NEXT:    s_bfe_u32 s34, s4, 0x1001a
-; GFX6-NEXT:    s_bfe_u32 s35, s4, 0x1001e
-; GFX6-NEXT:    s_bfe_u32 s36, s4, 0x1001c
-; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x10018
-; GFX6-NEXT:    v_mov_b32_e32 v0, s36
+; GFX6-NEXT:    s_bfe_u32 s35, s4, 0x10018
+; GFX6-NEXT:    s_bfe_u32 s36, s4, 0x1001e
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x1001c
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s20
-; GFX6-NEXT:    v_mov_b32_e32 v2, s35
+; GFX6-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s19
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v0, s35
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s34
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s17
@@ -2851,18 +2851,18 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
 ; GFX6-NEXT:    s_bfe_i32 s29, s4, 0x1001b
 ; GFX6-NEXT:    s_bfe_i32 s30, s4, 0x1001a
 ; GFX6-NEXT:    s_bfe_i32 s31, s4, 0x10019
-; GFX6-NEXT:    s_ashr_i32 s33, s4, 31
-; GFX6-NEXT:    s_bfe_i32 s34, s4, 0x1001e
-; GFX6-NEXT:    s_bfe_i32 s35, s4, 0x1001d
-; GFX6-NEXT:    s_bfe_i32 s36, s4, 0x1001c
-; GFX6-NEXT:    s_bfe_i32 s4, s4, 0x10018
-; GFX6-NEXT:    v_mov_b32_e32 v0, s36
-; GFX6-NEXT:    v_mov_b32_e32 v1, s35
-; GFX6-NEXT:    v_mov_b32_e32 v2, s34
-; GFX6-NEXT:    v_mov_b32_e32 v3, s33
+; GFX6-NEXT:    s_bfe_i32 s33, s4, 0x10018
+; GFX6-NEXT:    s_ashr_i32 s34, s4, 31
+; GFX6-NEXT:    s_bfe_i32 s35, s4, 0x1001e
+; GFX6-NEXT:    s_bfe_i32 s36, s4, 0x1001d
+; GFX6-NEXT:    s_bfe_i32 s4, s4, 0x1001c
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s36
+; GFX6-NEXT:    v_mov_b32_e32 v2, s35
+; GFX6-NEXT:    v_mov_b32_e32 v3, s34
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v0, s33
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s31
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s30
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s29
@@ -3287,33 +3287,33 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX6-NEXT:    s_bfe_u32 s8, s2, 0x1000b
 ; GFX6-NEXT:    s_bfe_u32 s9, s2, 0x10009
 ; GFX6-NEXT:    s_bfe_u32 s10, s2, 0x1000f
-; GFX6-NEXT:    s_bfe_u32 s13, s2, 0x1000d
-; GFX6-NEXT:    s_bfe_u32 s14, s2, 0x10013
-; GFX6-NEXT:    s_bfe_u32 s15, s2, 0x10011
-; GFX6-NEXT:    s_bfe_u32 s16, s2, 0x10017
-; GFX6-NEXT:    s_bfe_u32 s17, s2, 0x10015
-; GFX6-NEXT:    s_bfe_u32 s18, s2, 0x1001b
-; GFX6-NEXT:    s_bfe_u32 s19, s2, 0x10019
-; GFX6-NEXT:    s_lshr_b32 s20, s2, 31
-; GFX6-NEXT:    s_bfe_u32 s21, s2, 0x1001d
-; GFX6-NEXT:    s_bfe_u32 s22, s3, 0x10003
-; GFX6-NEXT:    s_bfe_u32 s23, s3, 0x10001
-; GFX6-NEXT:    s_bfe_u32 s24, s3, 0x10007
-; GFX6-NEXT:    s_bfe_u32 s25, s3, 0x10005
-; GFX6-NEXT:    s_bfe_u32 s26, s3, 0x1000b
-; GFX6-NEXT:    s_bfe_u32 s27, s3, 0x10009
-; GFX6-NEXT:    s_bfe_u32 s28, s3, 0x1000f
-; GFX6-NEXT:    s_bfe_u32 s29, s3, 0x1000d
-; GFX6-NEXT:    s_bfe_u32 s30, s3, 0x10013
-; GFX6-NEXT:    s_bfe_u32 s31, s3, 0x10011
-; GFX6-NEXT:    s_bfe_u32 s33, s3, 0x10017
-; GFX6-NEXT:    s_bfe_u32 s34, s3, 0x10015
-; GFX6-NEXT:    s_bfe_u32 s35, s3, 0x1001b
-; GFX6-NEXT:    s_bfe_u32 s36, s3, 0x10019
-; GFX6-NEXT:    s_lshr_b32 s37, s3, 31
-; GFX6-NEXT:    s_bfe_u32 s38, s3, 0x1001d
-; GFX6-NEXT:    s_and_b32 s12, s2, 1
-; GFX6-NEXT:    s_bfe_u32 s11, s2, 0x10002
+; GFX6-NEXT:    s_bfe_u32 s11, s2, 0x1000d
+; GFX6-NEXT:    s_bfe_u32 s12, s2, 0x10013
+; GFX6-NEXT:    s_bfe_u32 s13, s2, 0x10011
+; GFX6-NEXT:    s_bfe_u32 s14, s2, 0x10017
+; GFX6-NEXT:    s_bfe_u32 s15, s2, 0x10015
+; GFX6-NEXT:    s_bfe_u32 s16, s2, 0x1001b
+; GFX6-NEXT:    s_bfe_u32 s17, s2, 0x10019
+; GFX6-NEXT:    s_lshr_b32 s18, s2, 31
+; GFX6-NEXT:    s_bfe_u32 s19, s2, 0x1001d
+; GFX6-NEXT:    s_bfe_u32 s20, s3, 0x10003
+; GFX6-NEXT:    s_bfe_u32 s21, s3, 0x10001
+; GFX6-NEXT:    s_bfe_u32 s22, s3, 0x10007
+; GFX6-NEXT:    s_bfe_u32 s23, s3, 0x10005
+; GFX6-NEXT:    s_bfe_u32 s24, s3, 0x1000b
+; GFX6-NEXT:    s_bfe_u32 s25, s3, 0x10009
+; GFX6-NEXT:    s_bfe_u32 s26, s3, 0x1000f
+; GFX6-NEXT:    s_bfe_u32 s27, s3, 0x1000d
+; GFX6-NEXT:    s_bfe_u32 s28, s3, 0x10013
+; GFX6-NEXT:    s_bfe_u32 s29, s3, 0x10011
+; GFX6-NEXT:    s_bfe_u32 s30, s3, 0x10017
+; GFX6-NEXT:    s_bfe_u32 s31, s3, 0x10015
+; GFX6-NEXT:    s_bfe_u32 s33, s3, 0x1001b
+; GFX6-NEXT:    s_bfe_u32 s34, s3, 0x10019
+; GFX6-NEXT:    s_lshr_b32 s35, s3, 31
+; GFX6-NEXT:    s_bfe_u32 s36, s3, 0x1001d
+; GFX6-NEXT:    s_and_b32 s37, s2, 1
+; GFX6-NEXT:    s_bfe_u32 s38, s2, 0x10002
 ; GFX6-NEXT:    s_bfe_u32 s39, s2, 0x10006
 ; GFX6-NEXT:    s_bfe_u32 s40, s2, 0x10004
 ; GFX6-NEXT:    s_bfe_u32 s41, s2, 0x1000a
@@ -3332,8 +3332,9 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX6-NEXT:    s_bfe_u32 s54, s3, 0x10002
 ; GFX6-NEXT:    s_bfe_u32 s55, s3, 0x10006
 ; GFX6-NEXT:    s_bfe_u32 s56, s3, 0x10004
-; GFX6-NEXT:    s_bfe_u32 s57, s3, 0x10008
-; GFX6-NEXT:    s_bfe_u32 s58, s3, 0x1000e
+; GFX6-NEXT:    s_bfe_u32 s57, s3, 0x1000a
+; GFX6-NEXT:    s_bfe_u32 s58, s3, 0x10008
+; GFX6-NEXT:    s_bfe_u32 s2, s3, 0x1000e
 ; GFX6-NEXT:    s_bfe_u32 s59, s3, 0x1000c
 ; GFX6-NEXT:    s_bfe_u32 s60, s3, 0x10012
 ; GFX6-NEXT:    s_bfe_u32 s61, s3, 0x10010
@@ -3342,36 +3343,35 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX6-NEXT:    s_bfe_u32 s64, s3, 0x1001a
 ; GFX6-NEXT:    s_bfe_u32 s65, s3, 0x10018
 ; GFX6-NEXT:    s_bfe_u32 s66, s3, 0x1001e
-; GFX6-NEXT:    s_bfe_u32 s67, s3, 0x1001c
-; GFX6-NEXT:    s_bfe_u32 s68, s3, 0x1000a
+; GFX6-NEXT:    s_bfe_u32 s3, s3, 0x1001c
+; GFX6-NEXT:    v_mov_b32_e32 v0, s3
+; GFX6-NEXT:    v_mov_b32_e32 v1, s36
+; GFX6-NEXT:    v_mov_b32_e32 v2, s66
+; GFX6-NEXT:    v_mov_b32_e32 v3, s35
+; GFX6-NEXT:    v_mov_b32_e32 v18, s2
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s67
-; GFX6-NEXT:    v_mov_b32_e32 v1, s38
-; GFX6-NEXT:    v_mov_b32_e32 v2, s66
-; GFX6-NEXT:    v_mov_b32_e32 v3, s37
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s65
-; GFX6-NEXT:    v_mov_b32_e32 v5, s36
+; GFX6-NEXT:    v_mov_b32_e32 v5, s34
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s64
-; GFX6-NEXT:    v_mov_b32_e32 v7, s35
+; GFX6-NEXT:    v_mov_b32_e32 v7, s33
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s58
+; GFX6-NEXT:    v_mov_b32_e32 v1, s25
+; GFX6-NEXT:    v_mov_b32_e32 v2, s57
+; GFX6-NEXT:    v_mov_b32_e32 v3, s24
 ; GFX6-NEXT:    v_mov_b32_e32 v8, s63
-; GFX6-NEXT:    v_mov_b32_e32 v9, s34
+; GFX6-NEXT:    v_mov_b32_e32 v9, s31
 ; GFX6-NEXT:    v_mov_b32_e32 v10, s62
-; GFX6-NEXT:    v_mov_b32_e32 v11, s33
+; GFX6-NEXT:    v_mov_b32_e32 v11, s30
 ; GFX6-NEXT:    v_mov_b32_e32 v12, s61
-; GFX6-NEXT:    v_mov_b32_e32 v13, s31
+; GFX6-NEXT:    v_mov_b32_e32 v13, s29
 ; GFX6-NEXT:    v_mov_b32_e32 v14, s60
-; GFX6-NEXT:    v_mov_b32_e32 v15, s30
+; GFX6-NEXT:    v_mov_b32_e32 v15, s28
 ; GFX6-NEXT:    v_mov_b32_e32 v16, s59
-; GFX6-NEXT:    v_mov_b32_e32 v17, s29
-; GFX6-NEXT:    v_mov_b32_e32 v18, s58
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s57
-; GFX6-NEXT:    v_mov_b32_e32 v19, s28
-; GFX6-NEXT:    v_mov_b32_e32 v1, s27
-; GFX6-NEXT:    v_mov_b32_e32 v2, s68
-; GFX6-NEXT:    v_mov_b32_e32 v3, s26
+; GFX6-NEXT:    v_mov_b32_e32 v17, s27
+; GFX6-NEXT:    v_mov_b32_e32 v19, s26
 ; GFX6-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
 ; GFX6-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
 ; GFX6-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
@@ -3379,43 +3379,43 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s56
-; GFX6-NEXT:    v_mov_b32_e32 v1, s25
+; GFX6-NEXT:    v_mov_b32_e32 v1, s23
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s55
-; GFX6-NEXT:    v_mov_b32_e32 v3, s24
+; GFX6-NEXT:    v_mov_b32_e32 v3, s22
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s53
-; GFX6-NEXT:    v_mov_b32_e32 v1, s23
+; GFX6-NEXT:    v_mov_b32_e32 v1, s21
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s54
-; GFX6-NEXT:    v_mov_b32_e32 v3, s22
+; GFX6-NEXT:    v_mov_b32_e32 v3, s20
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s52
-; GFX6-NEXT:    v_mov_b32_e32 v1, s21
+; GFX6-NEXT:    v_mov_b32_e32 v1, s19
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s51
-; GFX6-NEXT:    v_mov_b32_e32 v3, s20
+; GFX6-NEXT:    v_mov_b32_e32 v3, s18
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s50
-; GFX6-NEXT:    v_mov_b32_e32 v1, s19
+; GFX6-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s49
-; GFX6-NEXT:    v_mov_b32_e32 v3, s18
+; GFX6-NEXT:    v_mov_b32_e32 v3, s16
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s48
-; GFX6-NEXT:    v_mov_b32_e32 v1, s17
+; GFX6-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s47
-; GFX6-NEXT:    v_mov_b32_e32 v3, s16
+; GFX6-NEXT:    v_mov_b32_e32 v3, s14
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s46
-; GFX6-NEXT:    v_mov_b32_e32 v1, s15
+; GFX6-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s45
-; GFX6-NEXT:    v_mov_b32_e32 v3, s14
+; GFX6-NEXT:    v_mov_b32_e32 v3, s12
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s44
-; GFX6-NEXT:    v_mov_b32_e32 v1, s13
+; GFX6-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s43
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
@@ -3432,9 +3432,9 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s12
+; GFX6-NEXT:    v_mov_b32_e32 v0, s37
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
-; GFX6-NEXT:    v_mov_b32_e32 v2, s11
+; GFX6-NEXT:    v_mov_b32_e32 v2, s38
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -4129,7 +4129,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX6-NEXT:    s_bfe_i32 s46, s3, 0x1000a
 ; GFX6-NEXT:    s_bfe_i32 s47, s3, 0x10009
 ; GFX6-NEXT:    s_bfe_i32 s48, s3, 0x10008
-; GFX6-NEXT:    s_bfe_i32 s49, s3, 0x1000e
+; GFX6-NEXT:    s_bfe_i32 s49, s3, 0x1000f
+; GFX6-NEXT:    s_bfe_i32 s2, s3, 0x1000e
 ; GFX6-NEXT:    s_bfe_i32 s50, s3, 0x1000d
 ; GFX6-NEXT:    s_bfe_i32 s51, s3, 0x1000c
 ; GFX6-NEXT:    s_bfe_i32 s52, s3, 0x10013
@@ -4147,18 +4148,24 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX6-NEXT:    s_ashr_i32 s64, s3, 31
 ; GFX6-NEXT:    s_bfe_i32 s65, s3, 0x1001e
 ; GFX6-NEXT:    s_bfe_i32 s66, s3, 0x1001d
-; GFX6-NEXT:    s_bfe_i32 s67, s3, 0x1001c
-; GFX6-NEXT:    s_bfe_i32 s68, s3, 0x1000f
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s67
+; GFX6-NEXT:    s_bfe_i32 s3, s3, 0x1001c
+; GFX6-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s66
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s65
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s64
+; GFX6-NEXT:    v_mov_b32_e32 v18, s2
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s63
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s62
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s61
 ; GFX6-NEXT:    v_mov_b32_e32 v7, s60
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s48
+; GFX6-NEXT:    v_mov_b32_e32 v1, s47
+; GFX6-NEXT:    v_mov_b32_e32 v2, s46
+; GFX6-NEXT:    v_mov_b32_e32 v3, s45
 ; GFX6-NEXT:    v_mov_b32_e32 v8, s59
 ; GFX6-NEXT:    v_mov_b32_e32 v9, s58
 ; GFX6-NEXT:    v_mov_b32_e32 v10, s57
@@ -4169,14 +4176,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX6-NEXT:    v_mov_b32_e32 v15, s52
 ; GFX6-NEXT:    v_mov_b32_e32 v16, s51
 ; GFX6-NEXT:    v_mov_b32_e32 v17, s50
-; GFX6-NEXT:    v_mov_b32_e32 v18, s49
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s48
-; GFX6-NEXT:    v_mov_b32_e32 v19, s68
-; GFX6-NEXT:    v_mov_b32_e32 v1, s47
-; GFX6-NEXT:    v_mov_b32_e32 v2, s46
-; GFX6-NEXT:    v_mov_b32_e32 v3, s45
+; GFX6-NEXT:    v_mov_b32_e32 v19, s49
 ; GFX6-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
 ; GFX6-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
 ; GFX6-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
@@ -4963,9 +4963,9 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt
 ; GFX6-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -5129,9 +5129,9 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out
 ; GFX6-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -5296,10 +5296,10 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -5421,8 +5421,8 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -5519,10 +5519,10 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; GFX6-NEXT:    s_mov_b32 s9, s3
 ; GFX6-NEXT:    buffer_load_ubyte v4, off, s[8:11], 0
 ; GFX6-NEXT:    v_mov_b32_e32 v5, 0
-; GFX6-NEXT:    v_mov_b32_e32 v1, v5
-; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, v5
+; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v4
 ; GFX6-NEXT:    v_bfe_u32 v2, v4, 1, 1
@@ -5675,12 +5675,12 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 2, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
+; GFX6-NEXT:    v_bfe_i32 v4, v3, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX6-NEXT:    v_bfe_i32 v4, v3, 0, 1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -5801,10 +5801,10 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; GFX6-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v1
-; GFX6-NEXT:    v_mov_b32_e32 v5, v1
-; GFX6-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v4, 1, v0
 ; GFX6-NEXT:    v_bfe_u32 v6, v0, 1, 1
@@ -5977,14 +5977,14 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 2, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 3, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
-; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v6, v4, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v4, v3, 0, 1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX6-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -6120,14 +6120,14 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; GFX6-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v1
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_mov_b32 s1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v9, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v13, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v15, v1
-; GFX6-NEXT:    s_mov_b32 s0, s4
-; GFX6-NEXT:    s_mov_b32 s1, s5
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_bfe_u32 v14, v0, 1, 1
 ; GFX6-NEXT:    v_bfe_u32 v10, v0, 3, 1
@@ -6317,22 +6317,22 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 2, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 3, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
+; GFX6-NEXT:    v_bfe_i32 v14, v5, 0, 1
+; GFX6-NEXT:    v_bfe_i32 v12, v3, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v6, v6, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v10, v8, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v8, v7, 0, 1
-; GFX6-NEXT:    v_bfe_i32 v14, v5, 0, 1
-; GFX6-NEXT:    v_bfe_i32 v12, v3, 0, 1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; GFX6-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GFX6-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
-; GFX6-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
 ; GFX6-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
 ; GFX6-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
 ; GFX6-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
@@ -6546,14 +6546,16 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    s_mov_b32 s10, s2
-; GFX6-NEXT:    s_mov_b32 s11, s3
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s8, s6
-; GFX6-NEXT:    s_mov_b32 s9, s7
-; GFX6-NEXT:    buffer_load_ushort v29, off, s[8:11], 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s7
+; GFX6-NEXT:    s_mov_b32 s6, s2
+; GFX6-NEXT:    s_mov_b32 s7, s3
+; GFX6-NEXT:    buffer_load_ushort v29, off, s[4:7], 0
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v7, v1
@@ -6568,8 +6570,6 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX6-NEXT:    v_mov_b32_e32 v24, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v26, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v28, v1
-; GFX6-NEXT:    s_mov_b32 s0, s4
-; GFX6-NEXT:    s_mov_b32 s1, s5
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_bfe_u32 v2, v29, 11, 1
 ; GFX6-NEXT:    v_bfe_u32 v0, v29, 10, 1
@@ -6582,17 +6582,17 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_bfe_u32 v6, v29, 14, 1
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
+; GFX6-NEXT:    v_bfe_u32 v11, v29, 13, 1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_bfe_u32 v9, v29, 12, 1
 ; GFX6-NEXT:    v_bfe_u32 v27, v29, 5, 1
 ; GFX6-NEXT:    v_bfe_u32 v23, v29, 7, 1
 ; GFX6-NEXT:    v_bfe_u32 v19, v29, 1, 1
 ; GFX6-NEXT:    v_bfe_u32 v15, v29, 3, 1
-; GFX6-NEXT:    v_bfe_u32 v11, v29, 13, 1
 ; GFX6-NEXT:    v_bfe_u32 v25, v29, 4, 1
 ; GFX6-NEXT:    v_bfe_u32 v21, v29, 6, 1
 ; GFX6-NEXT:    v_and_b32_e32 v17, 1, v29
 ; GFX6-NEXT:    v_bfe_u32 v13, v29, 2, 1
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_bfe_u32 v9, v29, 12, 1
 ; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:96
 ; GFX6-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:16
 ; GFX6-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0
@@ -6940,46 +6940,48 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 14, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 15, v1
+; GFX6-NEXT:    v_bfe_i32 v5, v4, 0, 1
+; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v8, 13, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v11, 10, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v12, 11, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v14, 8, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v16, 9, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v15, 6, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 5, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 2, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 3, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v13, 1, v1
-; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX6-NEXT:    v_bfe_i32 v5, v4, 0, 1
-; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 5, v1
 ; GFX6-NEXT:    buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:112
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_bfe_i32 v6, v10, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v4, v9, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v9, v8, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v7, v7, 0, 1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v11, 10, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v12, 11, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v14, 8, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v16, 9, v1
+; GFX6-NEXT:    v_bfe_i32 v6, v10, 0, 1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
+; GFX6-NEXT:    v_lshrrev_b32_e32 v15, 6, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 2, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 3, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v13, 1, v1
 ; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:96
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v9, v12, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v7, v11, 0, 1
-; GFX6-NEXT:    v_bfe_i32 v13, v13, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v11, v1, 0, 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 7, v1
+; GFX6-NEXT:    v_bfe_i32 v21, v16, 0, 1
+; GFX6-NEXT:    v_bfe_i32 v19, v14, 0, 1
+; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 1
+; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX6-NEXT:    v_bfe_i32 v13, v13, 0, 1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80
 ; GFX6-NEXT:    v_bfe_i32 v17, v1, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v15, v15, 0, 1
-; GFX6-NEXT:    v_bfe_i32 v21, v16, 0, 1
-; GFX6-NEXT:    v_bfe_i32 v19, v14, 0, 1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
+; GFX6-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
+; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
@@ -6989,8 +6991,6 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GFX6-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
-; GFX6-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
 ; GFX6-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:64
 ; GFX6-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
 ; GFX6-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
@@ -7349,12 +7349,12 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; GFX6-LABEL: constant_zextload_v32i1_to_v32i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_bfe_u32 s5, s4, 0x10001
 ; GFX6-NEXT:    s_bfe_u32 s6, s4, 0x10003
@@ -7386,13 +7386,13 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; GFX6-NEXT:    s_bfe_u32 s33, s4, 0x10016
 ; GFX6-NEXT:    s_bfe_u32 s34, s4, 0x10018
 ; GFX6-NEXT:    s_bfe_u32 s35, s4, 0x1001a
-; GFX6-NEXT:    s_bfe_u32 s36, s4, 0x1001e
-; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x1001c
-; GFX6-NEXT:    v_mov_b32_e32 v0, s36
+; GFX6-NEXT:    s_bfe_u32 s36, s4, 0x1001c
+; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x1001e
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s20
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v0, s36
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s19
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -7950,158 +7950,157 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshr_b32 s38, s4, 30
-; GFX6-NEXT:    s_lshr_b32 s40, s4, 31
-; GFX6-NEXT:    s_lshr_b32 s34, s4, 28
-; GFX6-NEXT:    s_lshr_b32 s36, s4, 29
-; GFX6-NEXT:    s_lshr_b32 s28, s4, 26
-; GFX6-NEXT:    s_lshr_b32 s30, s4, 27
-; GFX6-NEXT:    s_lshr_b32 s24, s4, 24
-; GFX6-NEXT:    s_lshr_b32 s26, s4, 25
-; GFX6-NEXT:    s_lshr_b32 s20, s4, 22
-; GFX6-NEXT:    s_lshr_b32 s22, s4, 23
-; GFX6-NEXT:    s_lshr_b32 s18, s4, 20
-; GFX6-NEXT:    s_lshr_b32 s6, s4, 21
-; GFX6-NEXT:    s_lshr_b32 s8, s4, 18
-; GFX6-NEXT:    s_lshr_b32 s10, s4, 19
-; GFX6-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX6-NEXT:    s_lshr_b32 s14, s4, 17
-; GFX6-NEXT:    s_lshr_b32 s16, s4, 14
-; GFX6-NEXT:    s_bfe_i64 s[44:45], s[4:5], 0x10000
-; GFX6-NEXT:    s_lshr_b32 s42, s4, 15
-; GFX6-NEXT:    v_mov_b32_e32 v0, s44
-; GFX6-NEXT:    v_mov_b32_e32 v1, s45
-; GFX6-NEXT:    s_lshr_b32 s44, s4, 12
-; GFX6-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s42, s4, 30
+; GFX6-NEXT:    s_lshr_b32 s44, s4, 31
+; GFX6-NEXT:    s_lshr_b32 s40, s4, 28
+; GFX6-NEXT:    s_lshr_b32 s38, s4, 29
+; GFX6-NEXT:    s_lshr_b32 s22, s4, 20
+; GFX6-NEXT:    s_lshr_b32 s20, s4, 21
+; GFX6-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s36, s4, 26
+; GFX6-NEXT:    s_lshr_b32 s34, s4, 27
+; GFX6-NEXT:    s_lshr_b32 s30, s4, 24
+; GFX6-NEXT:    s_lshr_b32 s28, s4, 25
+; GFX6-NEXT:    s_lshr_b32 s26, s4, 22
+; GFX6-NEXT:    s_lshr_b32 s24, s4, 23
+; GFX6-NEXT:    s_lshr_b32 s6, s4, 18
+; GFX6-NEXT:    s_lshr_b32 s8, s4, 19
+; GFX6-NEXT:    v_mov_b32_e32 v0, s42
+; GFX6-NEXT:    v_mov_b32_e32 v1, s43
+; GFX6-NEXT:    v_mov_b32_e32 v2, s44
+; GFX6-NEXT:    v_mov_b32_e32 v3, s45
 ; GFX6-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x10000
-; GFX6-NEXT:    v_mov_b32_e32 v2, s38
-; GFX6-NEXT:    v_mov_b32_e32 v3, s39
-; GFX6-NEXT:    s_lshr_b32 s38, s4, 13
+; GFX6-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX6-NEXT:    s_lshr_b32 s12, s4, 17
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s40
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s41
-; GFX6-NEXT:    s_lshr_b32 s40, s4, 10
-; GFX6-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX6-NEXT:    v_mov_b32_e32 v6, s38
+; GFX6-NEXT:    v_mov_b32_e32 v7, s39
 ; GFX6-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x10000
-; GFX6-NEXT:    v_mov_b32_e32 v6, s34
-; GFX6-NEXT:    v_mov_b32_e32 v7, s35
-; GFX6-NEXT:    s_lshr_b32 s34, s4, 11
+; GFX6-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s22
+; GFX6-NEXT:    v_mov_b32_e32 v1, s23
+; GFX6-NEXT:    v_mov_b32_e32 v2, s20
+; GFX6-NEXT:    v_mov_b32_e32 v3, s21
+; GFX6-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s14, s4, 14
+; GFX6-NEXT:    s_lshr_b32 s16, s4, 15
 ; GFX6-NEXT:    v_mov_b32_e32 v8, s36
 ; GFX6-NEXT:    v_mov_b32_e32 v9, s37
-; GFX6-NEXT:    s_lshr_b32 s36, s4, 8
-; GFX6-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
-; GFX6-NEXT:    v_mov_b32_e32 v10, s28
-; GFX6-NEXT:    v_mov_b32_e32 v11, s29
-; GFX6-NEXT:    s_lshr_b32 s28, s4, 9
+; GFX6-NEXT:    v_mov_b32_e32 v10, s34
+; GFX6-NEXT:    v_mov_b32_e32 v11, s35
 ; GFX6-NEXT:    v_mov_b32_e32 v12, s30
 ; GFX6-NEXT:    v_mov_b32_e32 v13, s31
-; GFX6-NEXT:    s_lshr_b32 s30, s4, 6
-; GFX6-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX6-NEXT:    v_mov_b32_e32 v14, s24
-; GFX6-NEXT:    v_mov_b32_e32 v15, s25
-; GFX6-NEXT:    s_lshr_b32 s24, s4, 7
+; GFX6-NEXT:    v_mov_b32_e32 v14, s28
+; GFX6-NEXT:    v_mov_b32_e32 v15, s29
 ; GFX6-NEXT:    v_mov_b32_e32 v16, s26
 ; GFX6-NEXT:    v_mov_b32_e32 v17, s27
-; GFX6-NEXT:    s_lshr_b32 s26, s4, 4
-; GFX6-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:240
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v2, s20
-; GFX6-NEXT:    v_mov_b32_e32 v3, s21
-; GFX6-NEXT:    s_lshr_b32 s20, s4, 5
-; GFX6-NEXT:    v_mov_b32_e32 v4, s22
-; GFX6-NEXT:    v_mov_b32_e32 v5, s23
-; GFX6-NEXT:    s_lshr_b32 s22, s4, 2
-; GFX6-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224
+; GFX6-NEXT:    v_mov_b32_e32 v18, s24
+; GFX6-NEXT:    v_mov_b32_e32 v19, s25
+; GFX6-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
+; GFX6-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
+; GFX6-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
+; GFX6-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, s18
-; GFX6-NEXT:    v_mov_b32_e32 v7, s19
-; GFX6-NEXT:    s_lshr_b32 s18, s4, 3
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
-; GFX6-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s7
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    v_mov_b32_e32 v3, s9
+; GFX6-NEXT:    s_lshr_b32 s18, s4, 12
+; GFX6-NEXT:    s_lshr_b32 s42, s4, 13
 ; GFX6-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
 ; GFX6-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208
-; GFX6-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:192
-; GFX6-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176
-; GFX6-NEXT:    v_mov_b32_e32 v8, s6
-; GFX6-NEXT:    v_mov_b32_e32 v9, s7
-; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160
-; GFX6-NEXT:    s_waitcnt expcnt(1)
-; GFX6-NEXT:    v_mov_b32_e32 v2, s8
-; GFX6-NEXT:    v_mov_b32_e32 v3, s9
-; GFX6-NEXT:    v_mov_b32_e32 v4, s10
-; GFX6-NEXT:    v_mov_b32_e32 v5, s11
-; GFX6-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s10
+; GFX6-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s13
-; GFX6-NEXT:    v_mov_b32_e32 v4, s14
-; GFX6-NEXT:    v_mov_b32_e32 v5, s15
-; GFX6-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128
+; GFX6-NEXT:    s_lshr_b32 s44, s4, 10
+; GFX6-NEXT:    s_lshr_b32 s40, s4, 11
+; GFX6-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s17
-; GFX6-NEXT:    v_mov_b32_e32 v4, s42
-; GFX6-NEXT:    v_mov_b32_e32 v5, s43
-; GFX6-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112
+; GFX6-NEXT:    s_lshr_b32 s38, s4, 8
+; GFX6-NEXT:    s_lshr_b32 s36, s4, 9
+; GFX6-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v2, s44
-; GFX6-NEXT:    v_mov_b32_e32 v3, s45
-; GFX6-NEXT:    v_mov_b32_e32 v4, s38
-; GFX6-NEXT:    v_mov_b32_e32 v5, s39
-; GFX6-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:96
+; GFX6-NEXT:    v_mov_b32_e32 v0, s18
+; GFX6-NEXT:    v_mov_b32_e32 v1, s19
+; GFX6-NEXT:    v_mov_b32_e32 v2, s42
+; GFX6-NEXT:    v_mov_b32_e32 v3, s43
+; GFX6-NEXT:    s_lshr_b32 s34, s4, 6
+; GFX6-NEXT:    s_lshr_b32 s30, s4, 7
+; GFX6-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s44
+; GFX6-NEXT:    v_mov_b32_e32 v1, s45
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s40
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s41
-; GFX6-NEXT:    v_mov_b32_e32 v4, s34
-; GFX6-NEXT:    v_mov_b32_e32 v5, s35
-; GFX6-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80
+; GFX6-NEXT:    s_lshr_b32 s28, s4, 4
+; GFX6-NEXT:    s_lshr_b32 s26, s4, 5
+; GFX6-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s38
+; GFX6-NEXT:    v_mov_b32_e32 v1, s39
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s37
-; GFX6-NEXT:    v_mov_b32_e32 v4, s28
-; GFX6-NEXT:    v_mov_b32_e32 v5, s29
-; GFX6-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64
+; GFX6-NEXT:    s_lshr_b32 s24, s4, 2
+; GFX6-NEXT:    s_lshr_b32 s22, s4, 3
+; GFX6-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s34
+; GFX6-NEXT:    v_mov_b32_e32 v1, s35
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s30
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s31
-; GFX6-NEXT:    v_mov_b32_e32 v4, s24
-; GFX6-NEXT:    v_mov_b32_e32 v5, s25
-; GFX6-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48
+; GFX6-NEXT:    s_lshr_b32 s46, s4, 1
+; GFX6-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s28
+; GFX6-NEXT:    v_mov_b32_e32 v1, s29
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s26
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s27
-; GFX6-NEXT:    v_mov_b32_e32 v4, s20
-; GFX6-NEXT:    v_mov_b32_e32 v5, s21
-; GFX6-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32
+; GFX6-NEXT:    s_bfe_i64 s[20:21], s[4:5], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[4:5], s[46:47], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s24
+; GFX6-NEXT:    v_mov_b32_e32 v1, s25
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s22
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s23
-; GFX6-NEXT:    v_mov_b32_e32 v4, s18
-; GFX6-NEXT:    v_mov_b32_e32 v5, s19
-; GFX6-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s20
+; GFX6-NEXT:    v_mov_b32_e32 v1, s21
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -8768,58 +8767,61 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX6-LABEL: constant_zextload_v64i1_to_v64i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_bfe_u32 s4, s2, 0x10003
 ; GFX6-NEXT:    s_bfe_u32 s5, s2, 0x10005
-; GFX6-NEXT:    s_bfe_u32 s8, s2, 0x10007
-; GFX6-NEXT:    s_bfe_u32 s11, s2, 0x10009
-; GFX6-NEXT:    s_bfe_u32 s13, s2, 0x1000b
-; GFX6-NEXT:    s_bfe_u32 s15, s2, 0x1000d
-; GFX6-NEXT:    s_bfe_u32 s17, s2, 0x1000f
-; GFX6-NEXT:    s_bfe_u32 s19, s2, 0x10011
-; GFX6-NEXT:    s_bfe_u32 s21, s2, 0x10013
-; GFX6-NEXT:    s_bfe_u32 s23, s2, 0x10015
-; GFX6-NEXT:    s_bfe_u32 s25, s2, 0x10017
-; GFX6-NEXT:    s_bfe_u32 s27, s2, 0x10019
-; GFX6-NEXT:    s_bfe_u32 s29, s2, 0x1001b
-; GFX6-NEXT:    s_bfe_u32 s31, s2, 0x1001d
-; GFX6-NEXT:    s_lshr_b32 s34, s2, 31
-; GFX6-NEXT:    s_bfe_u32 s35, s3, 0x10003
-; GFX6-NEXT:    s_bfe_u32 s36, s3, 0x10005
-; GFX6-NEXT:    s_bfe_u32 s37, s3, 0x10007
-; GFX6-NEXT:    s_bfe_u32 s38, s3, 0x10009
-; GFX6-NEXT:    s_bfe_u32 s39, s3, 0x1000b
-; GFX6-NEXT:    s_bfe_u32 s40, s3, 0x1000d
-; GFX6-NEXT:    s_bfe_u32 s41, s3, 0x1000f
-; GFX6-NEXT:    s_bfe_u32 s42, s3, 0x10011
-; GFX6-NEXT:    s_bfe_u32 s43, s3, 0x10013
-; GFX6-NEXT:    s_bfe_u32 s44, s3, 0x10015
-; GFX6-NEXT:    s_bfe_u32 s45, s3, 0x10017
-; GFX6-NEXT:    s_bfe_u32 s46, s3, 0x10019
-; GFX6-NEXT:    s_bfe_u32 s47, s3, 0x1001b
-; GFX6-NEXT:    s_bfe_u32 s48, s3, 0x1001d
-; GFX6-NEXT:    s_lshr_b32 s49, s3, 31
-; GFX6-NEXT:    s_bfe_u32 s9, s3, 0x10001
-; GFX6-NEXT:    s_bfe_u32 s6, s2, 0x10001
-; GFX6-NEXT:    s_and_b32 s7, s2, 1
-; GFX6-NEXT:    s_and_b32 s10, s3, 1
-; GFX6-NEXT:    s_bfe_u32 s12, s2, 0x10002
-; GFX6-NEXT:    s_bfe_u32 s14, s2, 0x10004
-; GFX6-NEXT:    s_bfe_u32 s16, s2, 0x10006
-; GFX6-NEXT:    s_bfe_u32 s18, s2, 0x10008
-; GFX6-NEXT:    s_bfe_u32 s20, s2, 0x1000a
-; GFX6-NEXT:    s_bfe_u32 s22, s2, 0x1000c
-; GFX6-NEXT:    s_bfe_u32 s24, s2, 0x1000e
-; GFX6-NEXT:    s_bfe_u32 s26, s2, 0x10010
-; GFX6-NEXT:    s_bfe_u32 s28, s2, 0x10012
-; GFX6-NEXT:    s_bfe_u32 s30, s2, 0x10014
-; GFX6-NEXT:    s_bfe_u32 s33, s2, 0x10016
+; GFX6-NEXT:    s_bfe_u32 s6, s2, 0x10007
+; GFX6-NEXT:    s_bfe_u32 s10, s2, 0x10009
+; GFX6-NEXT:    s_bfe_u32 s12, s2, 0x1000b
+; GFX6-NEXT:    s_bfe_u32 s14, s2, 0x1000d
+; GFX6-NEXT:    s_bfe_u32 s16, s2, 0x1000f
+; GFX6-NEXT:    s_bfe_u32 s18, s2, 0x10011
+; GFX6-NEXT:    s_bfe_u32 s20, s2, 0x10013
+; GFX6-NEXT:    s_bfe_u32 s22, s2, 0x10015
+; GFX6-NEXT:    s_bfe_u32 s24, s2, 0x10017
+; GFX6-NEXT:    s_bfe_u32 s25, s2, 0x10019
+; GFX6-NEXT:    s_bfe_u32 s26, s2, 0x1001b
+; GFX6-NEXT:    s_bfe_u32 s27, s2, 0x1001d
+; GFX6-NEXT:    s_lshr_b32 s28, s2, 31
+; GFX6-NEXT:    s_lshr_b32 s44, s3, 31
+; GFX6-NEXT:    s_bfe_u32 s7, s2, 0x10001
+; GFX6-NEXT:    s_and_b32 s8, s2, 1
+; GFX6-NEXT:    s_bfe_u32 s13, s2, 0x10002
+; GFX6-NEXT:    s_bfe_u32 s15, s2, 0x10004
+; GFX6-NEXT:    s_bfe_u32 s17, s2, 0x10006
+; GFX6-NEXT:    s_bfe_u32 s19, s2, 0x10008
+; GFX6-NEXT:    s_bfe_u32 s21, s2, 0x1000a
+; GFX6-NEXT:    s_bfe_u32 s23, s2, 0x1000c
+; GFX6-NEXT:    s_bfe_u32 s45, s2, 0x1000e
+; GFX6-NEXT:    s_bfe_u32 s46, s2, 0x10010
+; GFX6-NEXT:    s_bfe_u32 s47, s2, 0x10012
+; GFX6-NEXT:    s_bfe_u32 s48, s2, 0x10014
+; GFX6-NEXT:    s_bfe_u32 s49, s2, 0x10016
 ; GFX6-NEXT:    s_bfe_u32 s50, s2, 0x10018
 ; GFX6-NEXT:    s_bfe_u32 s51, s2, 0x1001a
 ; GFX6-NEXT:    s_bfe_u32 s52, s2, 0x1001c
 ; GFX6-NEXT:    s_bfe_u32 s53, s2, 0x1001e
+; GFX6-NEXT:    s_bfe_u32 s2, s3, 0x1001e
+; GFX6-NEXT:    s_bfe_u32 s29, s3, 0x10003
+; GFX6-NEXT:    s_bfe_u32 s30, s3, 0x10005
+; GFX6-NEXT:    s_bfe_u32 s31, s3, 0x10007
+; GFX6-NEXT:    s_bfe_u32 s33, s3, 0x10009
+; GFX6-NEXT:    s_bfe_u32 s34, s3, 0x1000b
+; GFX6-NEXT:    s_bfe_u32 s35, s3, 0x1000d
+; GFX6-NEXT:    s_bfe_u32 s36, s3, 0x1000f
+; GFX6-NEXT:    s_bfe_u32 s37, s3, 0x10011
+; GFX6-NEXT:    s_bfe_u32 s38, s3, 0x10013
+; GFX6-NEXT:    s_bfe_u32 s39, s3, 0x10015
+; GFX6-NEXT:    s_bfe_u32 s40, s3, 0x10017
+; GFX6-NEXT:    s_bfe_u32 s41, s3, 0x10019
+; GFX6-NEXT:    s_bfe_u32 s42, s3, 0x1001b
+; GFX6-NEXT:    s_bfe_u32 s43, s3, 0x1001d
+; GFX6-NEXT:    s_bfe_u32 s9, s3, 0x10001
+; GFX6-NEXT:    s_and_b32 s11, s3, 1
 ; GFX6-NEXT:    s_bfe_u32 s54, s3, 0x10002
 ; GFX6-NEXT:    s_bfe_u32 s55, s3, 0x10004
 ; GFX6-NEXT:    s_bfe_u32 s56, s3, 0x10006
@@ -8833,138 +8835,135 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX6-NEXT:    s_bfe_u32 s64, s3, 0x10016
 ; GFX6-NEXT:    s_bfe_u32 s65, s3, 0x10018
 ; GFX6-NEXT:    s_bfe_u32 s66, s3, 0x1001a
-; GFX6-NEXT:    s_bfe_u32 s67, s3, 0x1001e
-; GFX6-NEXT:    s_bfe_u32 s68, s3, 0x1001c
+; GFX6-NEXT:    s_bfe_u32 s67, s3, 0x1001c
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_mov_b32_e32 v2, s44
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_mov_b32_e32 v3, v1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s67
-; GFX6-NEXT:    v_mov_b32_e32 v2, s49
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:496
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s68
-; GFX6-NEXT:    v_mov_b32_e32 v2, s48
+; GFX6-NEXT:    v_mov_b32_e32 v0, s67
+; GFX6-NEXT:    v_mov_b32_e32 v2, s43
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:480
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s66
-; GFX6-NEXT:    v_mov_b32_e32 v2, s47
+; GFX6-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:464
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s65
-; GFX6-NEXT:    v_mov_b32_e32 v2, s46
+; GFX6-NEXT:    v_mov_b32_e32 v2, s41
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:448
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s64
-; GFX6-NEXT:    v_mov_b32_e32 v2, s45
+; GFX6-NEXT:    v_mov_b32_e32 v2, s40
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s63
-; GFX6-NEXT:    v_mov_b32_e32 v2, s44
+; GFX6-NEXT:    v_mov_b32_e32 v2, s39
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:416
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s62
-; GFX6-NEXT:    v_mov_b32_e32 v2, s43
+; GFX6-NEXT:    v_mov_b32_e32 v2, s38
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s61
-; GFX6-NEXT:    v_mov_b32_e32 v2, s42
+; GFX6-NEXT:    v_mov_b32_e32 v2, s37
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s60
-; GFX6-NEXT:    v_mov_b32_e32 v2, s41
+; GFX6-NEXT:    v_mov_b32_e32 v2, s36
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s59
-; GFX6-NEXT:    v_mov_b32_e32 v2, s40
+; GFX6-NEXT:    v_mov_b32_e32 v2, s35
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s58
-; GFX6-NEXT:    v_mov_b32_e32 v2, s39
+; GFX6-NEXT:    v_mov_b32_e32 v2, s34
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s57
-; GFX6-NEXT:    v_mov_b32_e32 v2, s38
+; GFX6-NEXT:    v_mov_b32_e32 v2, s33
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s56
-; GFX6-NEXT:    v_mov_b32_e32 v2, s37
+; GFX6-NEXT:    v_mov_b32_e32 v2, s31
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s55
-; GFX6-NEXT:    v_mov_b32_e32 v2, s36
+; GFX6-NEXT:    v_mov_b32_e32 v2, s30
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s54
-; GFX6-NEXT:    v_mov_b32_e32 v2, s35
+; GFX6-NEXT:    v_mov_b32_e32 v2, s29
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s53
-; GFX6-NEXT:    v_mov_b32_e32 v2, s34
+; GFX6-NEXT:    v_mov_b32_e32 v2, s28
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s52
-; GFX6-NEXT:    v_mov_b32_e32 v2, s31
+; GFX6-NEXT:    v_mov_b32_e32 v2, s27
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s51
-; GFX6-NEXT:    v_mov_b32_e32 v2, s29
+; GFX6-NEXT:    v_mov_b32_e32 v2, s26
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s50
-; GFX6-NEXT:    v_mov_b32_e32 v2, s27
+; GFX6-NEXT:    v_mov_b32_e32 v2, s25
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s33
-; GFX6-NEXT:    v_mov_b32_e32 v2, s25
+; GFX6-NEXT:    v_mov_b32_e32 v0, s49
+; GFX6-NEXT:    v_mov_b32_e32 v2, s24
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s30
-; GFX6-NEXT:    v_mov_b32_e32 v2, s23
+; GFX6-NEXT:    v_mov_b32_e32 v0, s48
+; GFX6-NEXT:    v_mov_b32_e32 v2, s22
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s28
-; GFX6-NEXT:    v_mov_b32_e32 v2, s21
+; GFX6-NEXT:    v_mov_b32_e32 v0, s47
+; GFX6-NEXT:    v_mov_b32_e32 v2, s20
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s26
-; GFX6-NEXT:    v_mov_b32_e32 v2, s19
+; GFX6-NEXT:    v_mov_b32_e32 v0, s46
+; GFX6-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s24
-; GFX6-NEXT:    v_mov_b32_e32 v2, s17
+; GFX6-NEXT:    v_mov_b32_e32 v0, s45
+; GFX6-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s22
-; GFX6-NEXT:    v_mov_b32_e32 v2, s15
+; GFX6-NEXT:    v_mov_b32_e32 v0, s23
+; GFX6-NEXT:    v_mov_b32_e32 v2, s14
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s20
-; GFX6-NEXT:    v_mov_b32_e32 v2, s13
+; GFX6-NEXT:    v_mov_b32_e32 v0, s21
+; GFX6-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s18
-; GFX6-NEXT:    v_mov_b32_e32 v2, s11
+; GFX6-NEXT:    v_mov_b32_e32 v0, s19
+; GFX6-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s16
-; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    v_mov_b32_e32 v0, s17
+; GFX6-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NEXT:    v_mov_b32_e32 v0, s15
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s12
+; GFX6-NEXT:    v_mov_b32_e32 v0, s13
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s11
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s7
-; GFX6-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -9934,349 +9933,358 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
 define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
 ; GFX6-LABEL: constant_sextload_v64i1_to_v64i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
 ; GFX6-NEXT:    s_mov_b32 s7, 0
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s9, s7
-; GFX6-NEXT:    s_mov_b32 s11, s7
-; GFX6-NEXT:    s_mov_b32 s13, s7
-; GFX6-NEXT:    s_mov_b32 s17, s7
-; GFX6-NEXT:    s_mov_b32 s19, s7
+; GFX6-NEXT:    s_mov_b32 s43, s7
+; GFX6-NEXT:    s_mov_b32 s41, s7
+; GFX6-NEXT:    s_mov_b32 s39, s7
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshr_b32 s6, s5, 30
-; GFX6-NEXT:    s_lshr_b32 s8, s5, 28
-; GFX6-NEXT:    s_lshr_b32 s10, s5, 29
-; GFX6-NEXT:    s_lshr_b32 s12, s5, 26
-; GFX6-NEXT:    s_lshr_b32 s16, s5, 27
-; GFX6-NEXT:    s_mov_b32 s18, s5
-; GFX6-NEXT:    s_bfe_i64 s[14:15], s[4:5], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[44:45], s[18:19], 0x10000
-; GFX6-NEXT:    s_ashr_i32 s18, s5, 31
-; GFX6-NEXT:    s_bfe_i64 s[28:29], s[16:17], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[36:37], s[12:13], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[38:39], s[10:11], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[40:41], s[8:9], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[42:43], s[6:7], 0x10000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[10:11], 0x0
+; GFX6-NEXT:    s_mov_b32 s37, s7
 ; GFX6-NEXT:    s_mov_b32 s31, s7
-; GFX6-NEXT:    s_mov_b32 s35, s7
-; GFX6-NEXT:    s_mov_b32 s25, s7
 ; GFX6-NEXT:    s_mov_b32 s27, s7
-; GFX6-NEXT:    s_mov_b32 s21, s7
 ; GFX6-NEXT:    s_mov_b32 s23, s7
-; GFX6-NEXT:    v_mov_b32_e32 v4, s18
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_lshr_b32 s6, s5, 30
+; GFX6-NEXT:    s_lshr_b32 s42, s5, 28
+; GFX6-NEXT:    s_lshr_b32 s40, s5, 29
+; GFX6-NEXT:    s_lshr_b32 s38, s5, 26
+; GFX6-NEXT:    s_lshr_b32 s36, s5, 27
+; GFX6-NEXT:    s_lshr_b32 s30, s5, 25
+; GFX6-NEXT:    s_lshr_b32 s26, s5, 23
+; GFX6-NEXT:    s_lshr_b32 s22, s5, 21
+; GFX6-NEXT:    s_lshr_b32 s20, s5, 18
+; GFX6-NEXT:    s_mov_b32 s21, s7
+; GFX6-NEXT:    s_bfe_i64 s[44:45], s[6:7], 0x10000
+; GFX6-NEXT:    s_ashr_i32 s6, s5, 31
+; GFX6-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_mov_b32 s0, s8
+; GFX6-NEXT:    s_mov_b32 s1, s9
+; GFX6-NEXT:    s_lshr_b32 s18, s5, 19
+; GFX6-NEXT:    s_mov_b32 s19, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s44
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s45
+; GFX6-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NEXT:    v_mov_b32_e32 v4, s42
+; GFX6-NEXT:    v_mov_b32_e32 v5, s43
+; GFX6-NEXT:    v_mov_b32_e32 v6, s40
+; GFX6-NEXT:    v_mov_b32_e32 v7, s41
+; GFX6-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX6-NEXT:    v_mov_b32_e32 v3, s6
+; GFX6-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX6-NEXT:    s_mov_b32 s35, s7
+; GFX6-NEXT:    s_mov_b32 s29, s7
+; GFX6-NEXT:    s_mov_b32 s25, s7
+; GFX6-NEXT:    s_lshr_b32 s16, s5, 16
+; GFX6-NEXT:    s_mov_b32 s17, s7
+; GFX6-NEXT:    s_mov_b32 s15, s7
+; GFX6-NEXT:    s_mov_b32 s13, s7
+; GFX6-NEXT:    s_mov_b32 s11, s7
+; GFX6-NEXT:    s_mov_b32 s9, s7
 ; GFX6-NEXT:    s_mov_b32 s45, s7
-; GFX6-NEXT:    v_mov_b32_e32 v6, s14
-; GFX6-NEXT:    v_mov_b32_e32 v7, s15
-; GFX6-NEXT:    s_mov_b32 s47, s7
-; GFX6-NEXT:    v_mov_b32_e32 v2, s42
-; GFX6-NEXT:    v_mov_b32_e32 v3, s43
 ; GFX6-NEXT:    s_mov_b32 s43, s7
-; GFX6-NEXT:    v_mov_b32_e32 v8, s40
-; GFX6-NEXT:    v_mov_b32_e32 v9, s41
 ; GFX6-NEXT:    s_mov_b32 s41, s7
-; GFX6-NEXT:    v_mov_b32_e32 v10, s38
-; GFX6-NEXT:    v_mov_b32_e32 v11, s39
+; GFX6-NEXT:    v_mov_b32_e32 v9, s39
 ; GFX6-NEXT:    s_mov_b32 s39, s7
-; GFX6-NEXT:    v_mov_b32_e32 v12, s36
-; GFX6-NEXT:    v_mov_b32_e32 v13, s37
-; GFX6-NEXT:    s_mov_b32 s15, s7
-; GFX6-NEXT:    v_mov_b32_e32 v14, s28
-; GFX6-NEXT:    v_mov_b32_e32 v15, s29
+; GFX6-NEXT:    v_mov_b32_e32 v11, s37
 ; GFX6-NEXT:    s_mov_b32 s37, s7
-; GFX6-NEXT:    s_lshr_b32 s30, s5, 24
-; GFX6-NEXT:    s_lshr_b32 s34, s5, 25
-; GFX6-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[28:29], s[30:31], 0x10000
-; GFX6-NEXT:    v_mov_b32_e32 v5, s18
-; GFX6-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:496
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v2, s28
-; GFX6-NEXT:    v_mov_b32_e32 v3, s29
-; GFX6-NEXT:    s_mov_b32 s29, s7
-; GFX6-NEXT:    v_mov_b32_e32 v4, s34
-; GFX6-NEXT:    v_mov_b32_e32 v5, s35
-; GFX6-NEXT:    s_lshr_b32 s24, s5, 22
-; GFX6-NEXT:    s_lshr_b32 s26, s5, 23
-; GFX6-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, s24
-; GFX6-NEXT:    v_mov_b32_e32 v9, s25
-; GFX6-NEXT:    s_mov_b32 s25, s7
-; GFX6-NEXT:    v_mov_b32_e32 v10, s26
-; GFX6-NEXT:    v_mov_b32_e32 v11, s27
+; GFX6-NEXT:    s_mov_b32 s47, s7
+; GFX6-NEXT:    v_mov_b32_e32 v15, s31
+; GFX6-NEXT:    s_mov_b32 s31, s7
+; GFX6-NEXT:    s_mov_b32 s49, s7
+; GFX6-NEXT:    v_mov_b32_e32 v19, s27
 ; GFX6-NEXT:    s_mov_b32 s27, s7
-; GFX6-NEXT:    s_lshr_b32 s20, s5, 20
-; GFX6-NEXT:    s_lshr_b32 s22, s5, 21
-; GFX6-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:496
+; GFX6-NEXT:    s_mov_b32 s51, s7
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v12, s20
-; GFX6-NEXT:    v_mov_b32_e32 v13, s21
-; GFX6-NEXT:    s_mov_b32 s35, s7
-; GFX6-NEXT:    v_mov_b32_e32 v14, s22
-; GFX6-NEXT:    v_mov_b32_e32 v15, s23
-; GFX6-NEXT:    s_mov_b32 s21, s7
+; GFX6-NEXT:    v_mov_b32_e32 v3, s23
 ; GFX6-NEXT:    s_mov_b32 s23, s7
-; GFX6-NEXT:    s_lshr_b32 s16, s5, 18
-; GFX6-NEXT:    s_lshr_b32 s18, s5, 19
-; GFX6-NEXT:    s_lshr_b32 s10, s5, 16
-; GFX6-NEXT:    s_lshr_b32 s12, s5, 17
-; GFX6-NEXT:    s_lshr_b32 s8, s5, 14
-; GFX6-NEXT:    s_lshr_b32 s44, s5, 15
-; GFX6-NEXT:    s_lshr_b32 s46, s5, 12
-; GFX6-NEXT:    s_lshr_b32 s42, s5, 13
-; GFX6-NEXT:    s_lshr_b32 s40, s5, 10
-; GFX6-NEXT:    s_lshr_b32 s38, s5, 11
-; GFX6-NEXT:    s_lshr_b32 s14, s5, 8
-; GFX6-NEXT:    s_lshr_b32 s36, s5, 9
-; GFX6-NEXT:    s_lshr_b32 s28, s5, 6
-; GFX6-NEXT:    s_lshr_b32 s30, s5, 7
-; GFX6-NEXT:    s_lshr_b32 s24, s5, 4
-; GFX6-NEXT:    s_lshr_b32 s26, s5, 5
-; GFX6-NEXT:    s_lshr_b32 s34, s5, 2
-; GFX6-NEXT:    s_lshr_b32 s20, s5, 3
-; GFX6-NEXT:    s_lshr_b32 s22, s5, 1
-; GFX6-NEXT:    s_bfe_i64 s[6:7], s[18:19], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:448
+; GFX6-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:480
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v2, s16
-; GFX6-NEXT:    v_mov_b32_e32 v3, s17
-; GFX6-NEXT:    s_lshr_b32 s16, s4, 30
-; GFX6-NEXT:    v_mov_b32_e32 v4, s6
-; GFX6-NEXT:    v_mov_b32_e32 v5, s7
-; GFX6-NEXT:    s_lshr_b32 s18, s4, 31
-; GFX6-NEXT:    s_bfe_i64 s[6:7], s[12:13], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:432
+; GFX6-NEXT:    v_mov_b32_e32 v5, s21
+; GFX6-NEXT:    s_mov_b32 s21, s7
+; GFX6-NEXT:    s_mov_b32 s53, s7
+; GFX6-NEXT:    s_bfe_i64 s[6:7], s[18:19], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s34, s5, 24
+; GFX6-NEXT:    s_lshr_b32 s14, s5, 17
+; GFX6-NEXT:    v_mov_b32_e32 v8, s38
+; GFX6-NEXT:    v_mov_b32_e32 v10, s36
+; GFX6-NEXT:    v_mov_b32_e32 v6, s6
+; GFX6-NEXT:    v_mov_b32_e32 v7, s7
+; GFX6-NEXT:    s_bfe_i64 s[6:7], s[16:17], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s12, s5, 14
+; GFX6-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:464
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, s10
-; GFX6-NEXT:    v_mov_b32_e32 v9, s11
-; GFX6-NEXT:    s_lshr_b32 s10, s4, 28
+; GFX6-NEXT:    v_mov_b32_e32 v8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v9, s7
+; GFX6-NEXT:    s_bfe_i64 s[6:7], s[14:15], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s28, s5, 22
+; GFX6-NEXT:    s_lshr_b32 s10, s5, 15
+; GFX6-NEXT:    v_mov_b32_e32 v12, s34
+; GFX6-NEXT:    v_mov_b32_e32 v13, s35
+; GFX6-NEXT:    v_mov_b32_e32 v14, s30
 ; GFX6-NEXT:    v_mov_b32_e32 v10, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v11, s7
-; GFX6-NEXT:    s_lshr_b32 s12, s4, 29
-; GFX6-NEXT:    s_bfe_i64 s[6:7], s[44:45], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:416
+; GFX6-NEXT:    s_bfe_i64 s[6:7], s[12:13], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s8, s5, 12
+; GFX6-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:448
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v12, s8
-; GFX6-NEXT:    v_mov_b32_e32 v13, s9
-; GFX6-NEXT:    s_lshr_b32 s8, s4, 26
+; GFX6-NEXT:    v_mov_b32_e32 v12, s6
+; GFX6-NEXT:    v_mov_b32_e32 v13, s7
+; GFX6-NEXT:    s_bfe_i64 s[6:7], s[10:11], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s24, s5, 20
+; GFX6-NEXT:    s_lshr_b32 s44, s5, 13
+; GFX6-NEXT:    v_mov_b32_e32 v16, s28
+; GFX6-NEXT:    v_mov_b32_e32 v17, s29
+; GFX6-NEXT:    v_mov_b32_e32 v18, s26
 ; GFX6-NEXT:    v_mov_b32_e32 v14, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v15, s7
-; GFX6-NEXT:    s_lshr_b32 s44, s4, 27
+; GFX6-NEXT:    s_bfe_i64 s[6:7], s[8:9], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s42, s5, 10
+; GFX6-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:432
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v16, s6
+; GFX6-NEXT:    v_mov_b32_e32 v17, s7
+; GFX6-NEXT:    s_bfe_i64 s[6:7], s[44:45], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s40, s5, 11
+; GFX6-NEXT:    v_mov_b32_e32 v0, s24
+; GFX6-NEXT:    v_mov_b32_e32 v1, s25
+; GFX6-NEXT:    v_mov_b32_e32 v2, s22
+; GFX6-NEXT:    v_mov_b32_e32 v18, s6
+; GFX6-NEXT:    v_mov_b32_e32 v19, s7
 ; GFX6-NEXT:    s_bfe_i64 s[6:7], s[42:43], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[42:43], s[46:47], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:400
+; GFX6-NEXT:    s_lshr_b32 s38, s5, 8
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:416
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s7
+; GFX6-NEXT:    s_bfe_i64 s[6:7], s[40:41], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s36, s5, 9
+; GFX6-NEXT:    v_mov_b32_e32 v4, s20
+; GFX6-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NEXT:    v_mov_b32_e32 v3, s7
+; GFX6-NEXT:    s_bfe_i64 s[6:7], s[38:39], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s46, s5, 6
+; GFX6-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:400
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v2, s42
-; GFX6-NEXT:    v_mov_b32_e32 v3, s43
-; GFX6-NEXT:    s_lshr_b32 s42, s4, 24
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s7
-; GFX6-NEXT:    s_lshr_b32 s46, s4, 25
-; GFX6-NEXT:    s_bfe_i64 s[6:7], s[38:39], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[38:39], s[40:41], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[6:7], s[36:37], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s30, s5, 7
+; GFX6-NEXT:    v_mov_b32_e32 v6, s6
+; GFX6-NEXT:    v_mov_b32_e32 v7, s7
+; GFX6-NEXT:    s_bfe_i64 s[6:7], s[46:47], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s48, s5, 4
+; GFX6-NEXT:    s_lshr_b32 s26, s5, 5
+; GFX6-NEXT:    s_lshr_b32 s12, s4, 29
 ; GFX6-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:384
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, s38
-; GFX6-NEXT:    v_mov_b32_e32 v9, s39
-; GFX6-NEXT:    s_lshr_b32 s38, s4, 22
+; GFX6-NEXT:    v_mov_b32_e32 v8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v9, s7
+; GFX6-NEXT:    s_bfe_i64 s[6:7], s[30:31], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s24, s4, 26
 ; GFX6-NEXT:    v_mov_b32_e32 v10, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v11, s7
-; GFX6-NEXT:    s_lshr_b32 s40, s4, 23
-; GFX6-NEXT:    s_bfe_i64 s[6:7], s[36:37], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[8:9], s[48:49], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[10:11], s[26:27], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s50, s5, 2
+; GFX6-NEXT:    s_lshr_b32 s22, s5, 3
+; GFX6-NEXT:    s_lshr_b32 s28, s4, 27
 ; GFX6-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:368
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v12, s14
-; GFX6-NEXT:    v_mov_b32_e32 v13, s15
-; GFX6-NEXT:    s_lshr_b32 s14, s4, 20
-; GFX6-NEXT:    v_mov_b32_e32 v14, s6
-; GFX6-NEXT:    v_mov_b32_e32 v15, s7
-; GFX6-NEXT:    s_lshr_b32 s6, s4, 21
-; GFX6-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:352
-; GFX6-NEXT:    v_mov_b32_e32 v16, s28
-; GFX6-NEXT:    v_mov_b32_e32 v17, s29
-; GFX6-NEXT:    s_lshr_b32 s28, s4, 18
-; GFX6-NEXT:    v_mov_b32_e32 v18, s30
-; GFX6-NEXT:    v_mov_b32_e32 v19, s31
-; GFX6-NEXT:    s_lshr_b32 s30, s4, 19
-; GFX6-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:336
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, s24
-; GFX6-NEXT:    v_mov_b32_e32 v9, s25
-; GFX6-NEXT:    s_lshr_b32 s24, s4, 16
-; GFX6-NEXT:    v_mov_b32_e32 v10, s26
-; GFX6-NEXT:    v_mov_b32_e32 v11, s27
-; GFX6-NEXT:    s_lshr_b32 s26, s4, 17
-; GFX6-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:320
+; GFX6-NEXT:    v_mov_b32_e32 v12, s8
+; GFX6-NEXT:    v_mov_b32_e32 v13, s9
+; GFX6-NEXT:    v_mov_b32_e32 v14, s10
+; GFX6-NEXT:    v_mov_b32_e32 v15, s11
+; GFX6-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:304
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v12, s34
-; GFX6-NEXT:    v_mov_b32_e32 v13, s35
-; GFX6-NEXT:    s_lshr_b32 s34, s4, 14
-; GFX6-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX6-NEXT:    v_mov_b32_e32 v10, s12
+; GFX6-NEXT:    v_mov_b32_e32 v11, s13
+; GFX6-NEXT:    s_bfe_i64 s[12:13], s[24:25], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s34, s4, 24
+; GFX6-NEXT:    s_bfe_i64 s[26:27], s[50:51], 0x10000
 ; GFX6-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX6-NEXT:    v_mov_b32_e32 v14, s20
-; GFX6-NEXT:    v_mov_b32_e32 v15, s21
-; GFX6-NEXT:    s_lshr_b32 s20, s4, 15
-; GFX6-NEXT:    v_mov_b32_e32 v2, s22
-; GFX6-NEXT:    v_mov_b32_e32 v3, s23
-; GFX6-NEXT:    s_lshr_b32 s22, s4, 12
-; GFX6-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:304
+; GFX6-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:288
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v16, s16
-; GFX6-NEXT:    v_mov_b32_e32 v17, s17
-; GFX6-NEXT:    s_lshr_b32 s16, s4, 13
-; GFX6-NEXT:    v_mov_b32_e32 v18, s18
-; GFX6-NEXT:    v_mov_b32_e32 v19, s19
-; GFX6-NEXT:    s_lshr_b32 s18, s4, 10
-; GFX6-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:288
+; GFX6-NEXT:    v_mov_b32_e32 v12, s12
+; GFX6-NEXT:    v_mov_b32_e32 v13, s13
+; GFX6-NEXT:    s_bfe_i64 s[12:13], s[28:29], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s20, s5, 1
+; GFX6-NEXT:    s_mov_b32 s52, s5
+; GFX6-NEXT:    s_lshr_b32 s42, s4, 25
+; GFX6-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:352
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, s10
-; GFX6-NEXT:    v_mov_b32_e32 v9, s11
-; GFX6-NEXT:    s_lshr_b32 s10, s4, 11
-; GFX6-NEXT:    v_mov_b32_e32 v10, s12
-; GFX6-NEXT:    v_mov_b32_e32 v11, s13
-; GFX6-NEXT:    s_lshr_b32 s12, s4, 8
-; GFX6-NEXT:    s_bfe_i64 s[36:37], s[44:45], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:272
+; GFX6-NEXT:    v_mov_b32_e32 v16, s26
+; GFX6-NEXT:    v_mov_b32_e32 v17, s27
+; GFX6-NEXT:    v_mov_b32_e32 v18, s22
+; GFX6-NEXT:    v_mov_b32_e32 v19, s23
+; GFX6-NEXT:    v_mov_b32_e32 v14, s12
+; GFX6-NEXT:    v_mov_b32_e32 v15, s13
+; GFX6-NEXT:    s_bfe_i64 s[12:13], s[34:35], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s40, s4, 22
+; GFX6-NEXT:    s_bfe_i64 s[30:31], s[52:53], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:272
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v12, s8
-; GFX6-NEXT:    v_mov_b32_e32 v13, s9
-; GFX6-NEXT:    s_lshr_b32 s8, s4, 9
-; GFX6-NEXT:    v_mov_b32_e32 v14, s36
-; GFX6-NEXT:    v_mov_b32_e32 v15, s37
-; GFX6-NEXT:    s_lshr_b32 s36, s4, 6
-; GFX6-NEXT:    s_bfe_i64 s[44:45], s[46:47], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT:    v_mov_b32_e32 v16, s12
+; GFX6-NEXT:    v_mov_b32_e32 v17, s13
+; GFX6-NEXT:    s_bfe_i64 s[12:13], s[42:43], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s18, s4, 30
+; GFX6-NEXT:    s_lshr_b32 s16, s4, 31
+; GFX6-NEXT:    s_lshr_b32 s38, s4, 23
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s30
+; GFX6-NEXT:    v_mov_b32_e32 v1, s31
+; GFX6-NEXT:    v_mov_b32_e32 v2, s20
+; GFX6-NEXT:    v_mov_b32_e32 v3, s21
+; GFX6-NEXT:    v_mov_b32_e32 v18, s12
+; GFX6-NEXT:    v_mov_b32_e32 v19, s13
+; GFX6-NEXT:    s_bfe_i64 s[12:13], s[40:41], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s14, s4, 28
+; GFX6-NEXT:    s_lshr_b32 s36, s4, 20
+; GFX6-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s42
-; GFX6-NEXT:    v_mov_b32_e32 v1, s43
-; GFX6-NEXT:    s_lshr_b32 s42, s4, 7
-; GFX6-NEXT:    v_mov_b32_e32 v2, s44
-; GFX6-NEXT:    v_mov_b32_e32 v3, s45
-; GFX6-NEXT:    s_lshr_b32 s44, s4, 4
-; GFX6-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240
+; GFX6-NEXT:    v_mov_b32_e32 v0, s12
+; GFX6-NEXT:    v_mov_b32_e32 v1, s13
+; GFX6-NEXT:    s_bfe_i64 s[12:13], s[38:39], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s44, s4, 21
+; GFX6-NEXT:    s_lshr_b32 s6, s4, 18
+; GFX6-NEXT:    s_lshr_b32 s8, s4, 19
+; GFX6-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:320
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v16, s38
-; GFX6-NEXT:    v_mov_b32_e32 v17, s39
-; GFX6-NEXT:    s_lshr_b32 s38, s4, 5
-; GFX6-NEXT:    v_mov_b32_e32 v18, s40
-; GFX6-NEXT:    v_mov_b32_e32 v19, s41
-; GFX6-NEXT:    s_lshr_b32 s40, s4, 2
+; GFX6-NEXT:    v_mov_b32_e32 v4, s18
+; GFX6-NEXT:    v_mov_b32_e32 v5, s19
+; GFX6-NEXT:    v_mov_b32_e32 v6, s16
+; GFX6-NEXT:    v_mov_b32_e32 v7, s17
 ; GFX6-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX6-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224
-; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v2, s12
+; GFX6-NEXT:    v_mov_b32_e32 v3, s13
+; GFX6-NEXT:    s_bfe_i64 s[12:13], s[36:37], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s10, s4, 16
+; GFX6-NEXT:    s_lshr_b32 s26, s4, 17
 ; GFX6-NEXT:    v_mov_b32_e32 v8, s14
 ; GFX6-NEXT:    v_mov_b32_e32 v9, s15
-; GFX6-NEXT:    s_lshr_b32 s14, s4, 3
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
-; GFX6-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v4, s12
+; GFX6-NEXT:    v_mov_b32_e32 v5, s13
+; GFX6-NEXT:    s_bfe_i64 s[12:13], s[44:45], 0x10000
 ; GFX6-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
-; GFX6-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
 ; GFX6-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX6-NEXT:    s_lshr_b32 s22, s4, 14
+; GFX6-NEXT:    s_lshr_b32 s30, s4, 15
+; GFX6-NEXT:    v_mov_b32_e32 v6, s12
+; GFX6-NEXT:    v_mov_b32_e32 v7, s13
+; GFX6-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224
 ; GFX6-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GFX6-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
-; GFX6-NEXT:    v_mov_b32_e32 v10, s6
-; GFX6-NEXT:    v_mov_b32_e32 v11, s7
-; GFX6-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
-; GFX6-NEXT:    s_waitcnt expcnt(2)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s28
-; GFX6-NEXT:    v_mov_b32_e32 v1, s29
-; GFX6-NEXT:    v_mov_b32_e32 v2, s30
-; GFX6-NEXT:    v_mov_b32_e32 v3, s31
+; GFX6-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:192
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GFX6-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:160
+; GFX6-NEXT:    s_waitcnt expcnt(1)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s7
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    v_mov_b32_e32 v3, s9
+; GFX6-NEXT:    s_lshr_b32 s20, s4, 12
+; GFX6-NEXT:    s_lshr_b32 s18, s4, 13
+; GFX6-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s24
-; GFX6-NEXT:    v_mov_b32_e32 v1, s25
+; GFX6-NEXT:    v_mov_b32_e32 v0, s10
+; GFX6-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s26
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s27
+; GFX6-NEXT:    s_lshr_b32 s46, s4, 10
+; GFX6-NEXT:    s_lshr_b32 s48, s4, 11
+; GFX6-NEXT:    s_lshr_b32 s42, s4, 4
+; GFX6-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s34
-; GFX6-NEXT:    v_mov_b32_e32 v1, s35
-; GFX6-NEXT:    v_mov_b32_e32 v2, s20
-; GFX6-NEXT:    v_mov_b32_e32 v3, s21
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s22
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s23
-; GFX6-NEXT:    v_mov_b32_e32 v2, s16
-; GFX6-NEXT:    v_mov_b32_e32 v3, s17
+; GFX6-NEXT:    v_mov_b32_e32 v2, s30
+; GFX6-NEXT:    v_mov_b32_e32 v3, s31
+; GFX6-NEXT:    s_lshr_b32 s50, s4, 8
+; GFX6-NEXT:    s_lshr_b32 s24, s4, 9
+; GFX6-NEXT:    s_lshr_b32 s40, s4, 5
+; GFX6-NEXT:    s_lshr_b32 s36, s4, 1
+; GFX6-NEXT:    s_bfe_i64 s[38:39], s[42:43], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[42:43], s[48:49], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[44:45], s[46:47], 0x10000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s20
+; GFX6-NEXT:    v_mov_b32_e32 v1, s21
+; GFX6-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NEXT:    v_mov_b32_e32 v3, s19
+; GFX6-NEXT:    s_lshr_b32 s28, s4, 6
+; GFX6-NEXT:    s_lshr_b32 s34, s4, 7
+; GFX6-NEXT:    s_lshr_b32 s16, s4, 2
+; GFX6-NEXT:    s_lshr_b32 s14, s4, 3
+; GFX6-NEXT:    s_bfe_i64 s[12:13], s[4:5], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[4:5], s[36:37], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[36:37], s[40:41], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[40:41], s[50:51], 0x10000
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s18
-; GFX6-NEXT:    v_mov_b32_e32 v1, s19
-; GFX6-NEXT:    v_mov_b32_e32 v2, s10
-; GFX6-NEXT:    v_mov_b32_e32 v3, s11
+; GFX6-NEXT:    v_mov_b32_e32 v0, s44
+; GFX6-NEXT:    v_mov_b32_e32 v1, s45
+; GFX6-NEXT:    v_mov_b32_e32 v2, s42
+; GFX6-NEXT:    v_mov_b32_e32 v3, s43
+; GFX6-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s12
-; GFX6-NEXT:    v_mov_b32_e32 v1, s13
-; GFX6-NEXT:    v_mov_b32_e32 v2, s8
-; GFX6-NEXT:    v_mov_b32_e32 v3, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s40
+; GFX6-NEXT:    v_mov_b32_e32 v1, s41
+; GFX6-NEXT:    v_mov_b32_e32 v2, s24
+; GFX6-NEXT:    v_mov_b32_e32 v3, s25
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s36
-; GFX6-NEXT:    v_mov_b32_e32 v1, s37
-; GFX6-NEXT:    v_mov_b32_e32 v2, s42
-; GFX6-NEXT:    v_mov_b32_e32 v3, s43
+; GFX6-NEXT:    v_mov_b32_e32 v0, s28
+; GFX6-NEXT:    v_mov_b32_e32 v1, s29
+; GFX6-NEXT:    v_mov_b32_e32 v2, s34
+; GFX6-NEXT:    v_mov_b32_e32 v3, s35
+; GFX6-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX6-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s44
-; GFX6-NEXT:    v_mov_b32_e32 v1, s45
-; GFX6-NEXT:    v_mov_b32_e32 v2, s38
-; GFX6-NEXT:    v_mov_b32_e32 v3, s39
+; GFX6-NEXT:    v_mov_b32_e32 v0, s38
+; GFX6-NEXT:    v_mov_b32_e32 v1, s39
+; GFX6-NEXT:    v_mov_b32_e32 v2, s36
+; GFX6-NEXT:    v_mov_b32_e32 v3, s37
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s40
-; GFX6-NEXT:    v_mov_b32_e32 v1, s41
+; GFX6-NEXT:    v_mov_b32_e32 v0, s16
+; GFX6-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s14
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s15
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GFX6-NEXT:    v_mov_b32_e32 v8, s4
-; GFX6-NEXT:    v_mov_b32_e32 v9, s5
-; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s12
+; GFX6-NEXT:    v_mov_b32_e32 v1, s13
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    v_mov_b32_e32 v3, s5
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 13408f1d89b25..0021af9381335 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN-NOHSA-SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefix=GCN-NOHSA-SI %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck --check-prefix=GCN-HSA %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GCN-NOHSA-VI %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG %s
@@ -185,10 +185,9 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NOHSA-SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_load_v3i16:
@@ -454,13 +453,12 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s3
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s3
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_load_v16i16:
@@ -561,40 +559,53 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:2
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 offset:4
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:14
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:10
 ; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v2, off, s[0:3], 0 offset:6
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v5, off, s[0:3], 0 offset:8
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 offset:10
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v6, off, s[0:3], 0 offset:12
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v7, off, s[0:3], 0 offset:14
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v8, off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v9, off, s[0:3], 0 offset:18
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v10, off, s[0:3], 0 offset:20
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v11, off, s[0:3], 0 offset:22
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v12, off, s[0:3], 0 offset:24
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v13, off, s[0:3], 0 offset:26
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v14, off, s[0:3], 0 offset:28
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v15, off, s[0:3], 0 offset:30
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 offset:2
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 offset:30
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v5, off, s[0:3], 0 offset:26
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v6, off, s[0:3], 0 offset:22
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v7, off, s[0:3], 0 offset:18
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v8, off, s[0:3], 0 offset:12
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v9, off, s[0:3], 0 offset:8
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v10, off, s[0:3], 0 offset:4
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v11, off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v12, off, s[0:3], 0 offset:28
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v13, off, s[0:3], 0 offset:24
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v14, off, s[0:3], 0 offset:20
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v15, off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(14)
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(13)
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v2
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(12)
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(11)
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(10)
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(9)
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(8)
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v7
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v3, v0, v8
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(6)
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v2, v1, v9
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(5)
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v1, v16, v10
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v0, v17, v11
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v7, v4, v12
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v6, v5, v13
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v5, v18, v14
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v3, v7, v6
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v2, v16, v5
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v1, v17, v4
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v0, v18, v0
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v7, v15, v14
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v6, v13, v12
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v5, v11, v10
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v4, v9, v8
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v4, v19, v15
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -1150,14 +1161,14 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GCN-NOHSA-SI-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s4, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s2, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s5, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -1238,14 +1249,14 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GCN-NOHSA-SI-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s4, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s5, s2
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s5, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s4, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -1330,14 +1341,13 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s6, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s6, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -1428,14 +1438,13 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s6, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s5, s5
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s6, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s4, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -1530,12 +1539,12 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s6, s5, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s7, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
@@ -1639,12 +1648,12 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s6, s5, 16
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s7, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s4, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
@@ -1751,13 +1760,13 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s8, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s9, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s10, s7, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s11, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s8, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s9, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s11
@@ -1910,13 +1919,13 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s8, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s9, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s10, s7, 16
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s11, s6, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s7, s7
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s6, s6
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s8, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s9, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s4, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s11
@@ -2069,32 +2078,32 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s13, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s15, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s17, s8, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s11, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s17, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s18
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s15, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s17
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s13, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
@@ -2330,32 +2339,32 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s12, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s13, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s5, s5
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s4, s4
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s14, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s15, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s7, s7
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s6, s6
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s16, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s17, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s9, s9
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s18, s11, 16
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s19, s10, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s11, s11
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s10, s10
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s16, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s17, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s9, s9
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s8, s8
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s18
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s14, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s15, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s7, s7
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s6, s6
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s17
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s12, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s13, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s5, s5
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s4, s4
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
@@ -2593,38 +2602,22 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s1, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s0, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s3, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s10, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s13, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s14, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s35, s1, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s36, s0, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s37, s3, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s38, s2, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
@@ -2633,24 +2626,40 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s31
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s13
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s30
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s29
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s28
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s27
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s26
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
@@ -3068,6 +3077,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s37, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s38, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s15, s15
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s14, s14
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s18, s1, 16
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s19, s0, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s20, s1
@@ -3076,30 +3089,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s23, s2, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s24, s3
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s25, s2
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s26, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s27, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s5, s5
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s4, s4
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s28, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s29, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s7, s7
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s6, s6
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s30, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s31, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s9, s9
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s8, s8
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s34, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s11, s11
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s10, s10
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s35, s13, 16
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s36, s12, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s13, s13
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s12, s12
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s37, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s38, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s15, s15
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s14, s14
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
@@ -3108,24 +3101,40 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s38
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s37
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s34, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s11, s11
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s10, s10
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s36
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s13
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s35
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s30, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s31, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s9, s9
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s8, s8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s28, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s29, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s7, s7
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s6, s6
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s31
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s30
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s26, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s27, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s5, s5
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s4, s4
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
@@ -3551,100 +3560,76 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s35, s3, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s40, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s55, s3, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s56, s2, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s2, s22, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s3, s25, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s67, s31, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s68, s30, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s31, s31, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s30, s30, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s1, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s0, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s35, s3, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s38, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s41, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s42, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s45, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s46, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s47, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s48, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s49, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s50, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s51, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s52, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s53, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s54, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s39, s1, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s40, s0, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s43, s3, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s44, s2, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s55, s17, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s56, s16, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s57, s19, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s58, s18, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s44, s1, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s46, s0, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s59, s21, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s60, s20, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s65, s29, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s66, s28, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s21, s21, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s20, s20, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s29, s29, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s28, s28, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s36
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s37
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s30
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s68
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s67
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s3
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s2
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s57, s19, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s58, s18, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s61, s23, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s62, s22, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s63, s25, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s64, s24, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s65, s27, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s66, s26, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s67, s29, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s68, s28, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s69, s31, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s70, s30, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s17, s17, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s62, s24, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s63, s27, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s64, s26, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s19, s19, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s18, s18, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s20, s20, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s23, s23, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s22, s22, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s25, s25, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s24, s24, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s27, s27, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s26, s26, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s29, s29, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s28, s28, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s31, s31, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s30, s30, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s21, s21, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s36
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s37
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s30
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s70
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s69
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s28
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s68
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s66
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s29
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s67
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s65
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s60
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s59
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s38, s17, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s39, s16, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s16, s16, 0xffff
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s26
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s66
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s64
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s27
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s65
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s63
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s64
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s62
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s25
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s63
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s62
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s23
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s20
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s61
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s60
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s59
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
@@ -3655,42 +3640,66 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s58
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s57
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s53, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s54, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s56
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s39
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s55
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s38
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s51, s13, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s52, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s54
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s53
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s49, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s50, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s52
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s13
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s51
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s47, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s48, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s50
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s49
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s43, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s45, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s48
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s47
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s41, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s42, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s46
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s45
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s45
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s43
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
@@ -3699,15 +3708,15 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s41
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s44
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s38
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s43
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s56
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s40
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s55
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s35
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s40
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s46
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s34
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s44
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -4444,166 +4453,166 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[36:39], s[4:5], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x0
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x10
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s17, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s34, s16, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s17, s17
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s16, s16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s35, s19, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s38, s18, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s19, s19
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s18, s18
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s39, s21, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s40, s20, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s41, s3, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s42, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s43, s3
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s44, s2
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s2, s22, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s3, s23
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s67, s31, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s68, s30, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s31, s31
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s30, s30
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s1, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s34, s0, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s35, s1
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s40, s0
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s59, s21, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s60, s20, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s21, s21
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s20, s20
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s41, s23, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s42, s22, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s23, s23
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s22, s22
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s43, s25, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s44, s24, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s25, s25
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s24, s24
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s45, s27, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s46, s26, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s27, s27
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s26, s26
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s47, s29, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s48, s28, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s65, s29, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s66, s28, 16
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s29, s29
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s28, s28
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s49, s31, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s50, s30, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s31, s31
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s30, s30
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s51, s1, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s52, s0, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s53, s1
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s54, s0
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s55, s3, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s56, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s57, s3
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s58, s2
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s59, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s60, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s5, s5
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s4, s4
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s61, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s62, s7
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s6, s6
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s63, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s64, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s9, s9
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s8, s8
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s65, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s66, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s11, s11
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s10, s10
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s67, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s68, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s13, s13
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s12, s12
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s69, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s70, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s15, s15
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s14, s14
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s7, s7, 16
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s36
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s37
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s30
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s68
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s67
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s3
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s70
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s69
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s68
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s67
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s66
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s65
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s64
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s63
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s61
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s62
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s57, s19, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s58, s18, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s19, s19
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s18, s18
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s61, s23, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s22, s22
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s23, s25, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s62, s24, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s25, s25
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s24, s24
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s63, s27, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s64, s26, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s27, s27
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s26, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s28
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s66
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s65
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s20
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s60
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s59
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s55, s17, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s56, s16, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s17, s17
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s16, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s64
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s27
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s63
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s62
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s61
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s58
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s58
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s57
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s38, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s39, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s15, s15
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s14, s14
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s16
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s56
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s57
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s17
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s55
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s53, s13, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s54, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s13, s13
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s12, s12
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s38
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s51, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s52, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s11, s11
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s10, s10
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s53
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s49, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s50, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s9, s9
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s8, s8
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s52
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s53
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s51
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s47, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s48, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s7, s7
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s6, s6
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s30
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s50
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s49
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s45, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s46, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s5, s5
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i16 s4, s4
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s28
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s48
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s47
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s46
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s27
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s45
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s44
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s25
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s43
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s44
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s42
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s43
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s41
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s40
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s39
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s38
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s35
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s40
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s34
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s35
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -5753,18 +5762,20 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v2i16_to_v2i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s4, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s2, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s0, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s1, s2, 0xffff
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64:
@@ -5850,17 +5861,17 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GCN-NOHSA-SI-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s4, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[2:3], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s6, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -5950,24 +5961,26 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v4i16_to_v4i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s6, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s7, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
+; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s0, s3, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s3, s3, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s1, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s2, s2, 0xffff
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64:
@@ -6083,30 +6096,32 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v4i16_to_v4i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, 0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, s5
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s8, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[4:5], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s9, s5, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s12, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[6:7], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[8:9], 0x100000
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s9
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s3
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s0, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[2:3], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s12, s3, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s13, s3, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[2:3], s[8:9], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x100000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s12
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s7
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64:
@@ -6237,27 +6252,27 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s8, s5, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s9, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s10, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s11, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s8, s5, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s10, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s11, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
@@ -6452,40 +6467,41 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s7
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s5
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s9, s5, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s11, s5, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s6, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[4:5], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[6:7], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s13, s5, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s15, s5, 16
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[10:11], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s20, s7, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s21, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s10, s7, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s20, s7, 16
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[8:9], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[12:13], 0x100000
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s10
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x100000
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x100000
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s13
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s9
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64:
@@ -6693,61 +6709,61 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, -1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s1, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s13, s3, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s15, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s17, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s0, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s0, s0, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s2, s2, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s1, s1, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s3, s3, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s15, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s13, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s14
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s13
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s12
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s17, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s17
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s18
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64:
@@ -7062,6 +7078,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s14, s5
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s16, s3
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s18, s1
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s13, s1, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s15, s1, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s17, s3, 31
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s6, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s2, 16
@@ -7070,66 +7089,66 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[30:31], s[2:3], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[34:35], s[4:5], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[6:7], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s21, s1, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s23, s1, 16
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[0:1], s[18:19], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s25, s3, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s27, s3, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s18, s3, 16
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[2:3], s[16:17], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s5, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s38, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s16, s5, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s19, s5, 16
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s39, s7, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s40, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s14, s7, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s7, 16
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[12:13], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[24:25], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[22:23], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[20:21], 0x100000
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s40
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s14
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s38
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s27
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s17
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x100000
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s23
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s13
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x100000
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s37
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s34
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s35
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s30
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s28
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s29
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s21
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x100000
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s17
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s13
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s35
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s30
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s28
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s27
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64:
@@ -7498,61 +7517,49 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s1, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s3, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s11, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s2, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s0, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s35, s0, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s36, s2, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s37, s1, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s38, s3, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s17
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s15
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s25
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s13
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s23
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s7
@@ -7565,26 +7572,38 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s37
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s18
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s26
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s27
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s28
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s29
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
@@ -8172,149 +8191,155 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s18, s15
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s20, s13
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s24, s11
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s26, s9
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s22, s7
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s42, s15
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s43, s11, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s25, s3, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s27, s3, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s45, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s44, s13
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s54, s3
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[2:3], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[2:3], s[42:43], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s47, s13, 31
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s46, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[2:3], s[44:45], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s49, s13, 16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s48, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[2:3], s[46:47], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s51, s15, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s53, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s50, s7
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s52, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[2:3], s[48:49], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s1, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s39, s1, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s41, s3, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s43, s3, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s23, s5, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s27, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s29, s7, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s31, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s35, s9, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s37, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[46:47], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s55, s11, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s57, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[48:49], s[24:25], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[50:51], s[20:21], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[52:53], s[18:19], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s58, s13, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s59, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s60, s15, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s61, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s54, s5
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s44, s3
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s56, s1
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s38, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s40, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s42, s0, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s23, s1, 16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s40, s1
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s38, s0, 16
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[0:1], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[2:3], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[14:15], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s52
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s53
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s50
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s51
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s48
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s49
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s46
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s47
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s3
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[2:3], s[50:51], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[52:53], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s29, s5, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s31, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s14, 16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s3
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s53
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s51
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s49
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s47
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s16
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[22:23], 0x100000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s17
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[54:55], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[46:47], s[56:57], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x100000
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s61
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s60
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s59
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s58
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s57
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s55
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s37
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s35
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s29
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x100000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s29
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s27
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s23
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[42:43], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[40:41], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[38:39], 0x100000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s27
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s23
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s39, s9, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s41, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s27
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s35, s7, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s37, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x100000
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(5)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s44
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s45
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s43
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s41
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s46
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s47
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s28
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s29
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s35
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s25
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s30
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s31
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s37
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[40:41], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s35
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s17
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[38:39], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s37
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s45
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s43
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s41
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s33
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s36
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s37
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s34
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s35
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s28
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s29
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s26
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s27
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s23
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v26, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, s17
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s17
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 34828aa418bc5..2e862cee733f4 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GFX6-NOHSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6-NOHSA %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GFX7-HSA %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8-NOHSA %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
@@ -209,11 +209,10 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NOHSA-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NOHSA-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:8
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
@@ -430,13 +429,12 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s2
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s3
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s3
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_load_v8i32:
@@ -570,26 +568,28 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX6-NOHSA-LABEL: constant_load_v9i32:
 ; GFX6-NOHSA:       ; %bb.0: ; %entry
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX6-NOHSA-NEXT:    s_mov_b32 s15, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s14, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dword s12, s[10:11], 0x8
+; GFX6-NOHSA-NEXT:    s_load_dword s16, s[10:11], 0x8
 ; GFX6-NOHSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s11, 0xf000
-; GFX6-NOHSA-NEXT:    s_mov_b32 s10, -1
+; GFX6-NOHSA-NEXT:    s_mov_b32 s12, s8
+; GFX6-NOHSA-NEXT:    s_mov_b32 s13, s9
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s12
-; GFX6-NOHSA-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:32
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s16
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NOHSA-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:32
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s7
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s3
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_load_v9i32:
@@ -750,27 +750,29 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
 ; GFX6-NOHSA-LABEL: constant_load_v10i32:
 ; GFX6-NOHSA:       ; %bb.0: ; %entry
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX6-NOHSA-NEXT:    s_mov_b32 s15, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s14, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dwordx2 s[12:13], s[10:11], 0x8
+; GFX6-NOHSA-NEXT:    s_load_dwordx2 s[16:17], s[10:11], 0x8
 ; GFX6-NOHSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s11, 0xf000
-; GFX6-NOHSA-NEXT:    s_mov_b32 s10, -1
+; GFX6-NOHSA-NEXT:    s_mov_b32 s12, s8
+; GFX6-NOHSA-NEXT:    s_mov_b32 s13, s9
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s12
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s13
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0 offset:32
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s16
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s17
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[3:4], off, s[12:15], 0 offset:32
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s7
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s3
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_load_v10i32:
@@ -935,18 +937,19 @@ entry:
 define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GFX6-NOHSA-LABEL: constant_load_v11i32:
 ; GFX6-NOHSA:       ; %bb.0: ; %entry
-; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[12:15], s[10:11], 0x8
-; GFX6-NOHSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s14
-; GFX6-NOHSA-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:40
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[12:15], s[2:3], 0x8
+; GFX6-NOHSA-NEXT:    s_mov_b32 s8, s0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s9, s1
+; GFX6-NOHSA-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x0
+; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s14
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s13
+; GFX6-NOHSA-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:40
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0 offset:32
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
@@ -1131,29 +1134,30 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
 ; GFX6-NOHSA-LABEL: constant_load_v12i32:
 ; GFX6-NOHSA:       ; %bb.0: ; %entry
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX6-NOHSA-NEXT:    s_mov_b32 s15, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s14, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[12:15], s[10:11], 0x8
+; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[16:19], s[10:11], 0x8
 ; GFX6-NOHSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s11, 0xf000
-; GFX6-NOHSA-NEXT:    s_mov_b32 s10, -1
+; GFX6-NOHSA-NEXT:    s_mov_b32 s12, s8
+; GFX6-NOHSA-NEXT:    s_mov_b32 s13, s9
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s12
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s13
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s14
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s15
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s7
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s16
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s17
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s19
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s4
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s5
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s6
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:32
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s3
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_load_v12i32:
@@ -1339,22 +1343,21 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s15
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s9
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s10
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s11
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s8
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s9
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s10
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s11
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:32
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
@@ -1555,11 +1558,11 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
 ; GFX6-NOHSA-LABEL: constant_zextload_i32_to_i64:
 ; GFX6-NOHSA:       ; %bb.0:
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1655,9 +1658,9 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s5, s4, 31
-; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1760,11 +1763,11 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ; GFX6-NOHSA-LABEL: constant_zextload_v1i32_to_v1i64:
 ; GFX6-NOHSA:       ; %bb.0:
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1860,9 +1863,9 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s5, s4, 31
-; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1965,16 +1968,18 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX6-NOHSA-LABEL: constant_zextload_v2i32_to_v2i64:
 ; GFX6-NOHSA:       ; %bb.0:
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
+; GFX6-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX6-NOHSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s3
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64:
@@ -2080,10 +2085,10 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s6, s5, 31
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s7, s4, 31
-; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s7
@@ -2206,20 +2211,22 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; GFX6-NOHSA-LABEL: constant_zextload_v4i32_to_v4i64:
 ; GFX6-NOHSA:       ; %bb.0:
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
+; GFX6-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s11
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s9
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64:
@@ -2357,20 +2364,21 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s10, s7, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s11, s6, 31
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s8, s5, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s9, s7, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s10, s6, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s11, s4, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s9, s4, 31
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s5
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s10
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s9
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s11
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s8
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s9
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s8
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64:
@@ -2534,28 +2542,30 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; GFX6-NOHSA-LABEL: constant_zextload_v8i32_to_v8i64:
 ; GFX6-NOHSA:       ; %bb.0:
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s10, -1
+; GFX6-NOHSA-NEXT:    s_mov_b32 s15, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s14, -1
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s12, s8
+; GFX6-NOHSA-NEXT:    s_mov_b32 s13, s9
+; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:48
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:32
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s3
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64:
@@ -2751,40 +2761,43 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GFX6-NOHSA-LABEL: constant_sextload_v8i32_to_v8i64:
 ; GFX6-NOHSA:       ; %bb.0:
-; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s11, 0xf000
-; GFX6-NOHSA-NEXT:    s_mov_b32 s10, -1
+; GFX6-NOHSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s12, s1, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s13, s0, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s14, s3, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s15, s2, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s16, s5, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s17, s7, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s18, s6, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s19, s4, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s18, s11, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s19, s10, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s16, s9, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s17, s8, 31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s19
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s11
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s18
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s14, s7, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s15, s6, 31
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s17
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s9
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s16
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s12, s5, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s13, s4, 31
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s5
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s2
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s3
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s0
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s18
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s17
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s19
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s16
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:32
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s15
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s14
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:16
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s13
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s12
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[8:11], 0
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s14
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s13
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s12
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64:
@@ -3053,65 +3066,69 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s19, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s18, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s20, s1, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s21, s0, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s22, s3, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s23, s2, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s24, s5, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s25, s4, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s26, s7, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s27, s6, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s28, s9, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s29, s8, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s30, s11, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s31, s10, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s35, s15, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s36, s14, 31
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s33, s13, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s34, s15, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s35, s14, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s36, s12, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s34, s12, 31
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s36
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s15
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s12
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s13
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s10
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s11
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s8
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s9
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s6
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v18, s7
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s35
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s34
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s35
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s30, s11, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s31, s10, 31
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s36
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s33
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:96
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s12
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s34
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s13
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s33
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s28, s9, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s29, s8, 31
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s11
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s30
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s26, s7, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s27, s6, 31
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s29
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s9
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s28
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s24, s5, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s25, s4, 31
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s3
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s30
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:80
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s27
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s26
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s22, s3, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s23, s2, 31
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s0
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s29
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s28
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[16:19], 0 offset:64
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s27
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s26
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[16:19], 0 offset:48
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s25
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s24
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s20, s1, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s21, s0, 31
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s23
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s22
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s21
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s20
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[16:19], 0
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s23
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s3
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s22
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s21
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s1
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s20
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64:
@@ -3567,14 +3584,16 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
 define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GFX6-NOHSA-LABEL: constant_zextload_v16i32_to_v16i64:
 ; GFX6-NOHSA:       ; %bb.0:
-; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x9
-; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s19, 0xf000
+; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[20:23], s[4:5], 0x9
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s19, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s18, -1
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    s_load_dwordx16 s[0:15], s[22:23], 0x0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s16, s20
+; GFX6-NOHSA-NEXT:    s_mov_b32 s17, s21
+; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s14
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s15
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112
@@ -3936,134 +3955,135 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX6-NOHSA-LABEL: constant_sextload_v32i32_to_v32i64:
 ; GFX6-NOHSA:       ; %bb.0:
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dwordx16 s[16:31], s[2:3], 0x10
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s38, -1
+; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    s_load_dwordx16 s[16:31], s[2:3], 0x10
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s36, s0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s37, s1
 ; GFX6-NOHSA-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s33, s17, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s34, s16, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s35, s19, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s40, s18, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s41, s21, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s42, s20, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s43, s30, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s44, s30, 31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s44
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s44, s31, 31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s43
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s43, s28, 31
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s44
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s44, s28, 31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s44
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s44, s29, 31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s43
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s43, s23, 31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s44
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s44, s22, 31
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s30
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s28
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s29
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s26
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s24
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s22
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s34, s16, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s40, s18, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s42, s20, 31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s44
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s44, s22, 31
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:240
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s20
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:224
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s18
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s28
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s29
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s22
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s20
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s18
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v20, s16
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s16, s25, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s18, s27, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s20, s26, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s22, s24, 31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s27
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s25
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v18, s23
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s21
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s19
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v22, s17
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s20
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s18
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:208
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s17, s1, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s18, s0, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s19, s3, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s20, s2, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s21, s5, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s23, s4, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s24, s7, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s25, s6, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s26, s9, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s27, s8, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s28, s11, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s29, s10, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s18, s24, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s20, s27, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s22, s26, 31
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[3:6], off, s[36:39], 0 offset:224
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s26
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s22
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s27
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s20
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s18
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s18, s15, 31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s16
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s16, s14, 31
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:208
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s30, s13, 31
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s31, s12, 31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s22
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s22, s15, 31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s16
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s16, s14, 31
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:192
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(1)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s14
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s15
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s44
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s43
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:176
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(1)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s12
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s13
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s42
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s41
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:160
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s11
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s40
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s35
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:144
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s8
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s9
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v21, s34
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v23, s33
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[36:39], 0 offset:128
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s6
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v18, s7
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s16
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s22
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:112
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s16
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s15
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s18
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s28, s11, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s29, s10, 31
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:112
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s5
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s30
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:96
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s12
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s13
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s30
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s26, s9, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s27, s8, 31
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:96
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s2
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s3
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s29
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s28
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s24
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s25
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s24, s7, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s25, s6, 31
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s27
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s9
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s26
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s43, s23, 31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s23
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s22, s5, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s23, s4, 31
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:64
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s25
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s24
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s41, s21, 31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s21
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s20, s3, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s21, s2, 31
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:48
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s23
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s22
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s33, s17, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s35, s19, 31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v18, s19
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v22, s17
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s17, s1, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s19, s0, 31
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:32
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s21
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s3
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s20
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s44
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s43
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s42
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s41
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s40
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s35
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v21, s34
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v23, s33
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:16
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s19
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s27
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s26
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:64
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s25
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s24
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:48
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s23
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s21
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:32
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s20
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s19
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:16
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s17
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:192
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:176
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:160
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:144
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[36:39], 0 offset:128
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
@@ -4927,12 +4947,12 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX6-NOHSA-LABEL: constant_zextload_v32i32_to_v32i64:
 ; GFX6-NOHSA:       ; %bb.0:
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dwordx16 s[16:31], s[2:3], 0x10
-; GFX6-NOHSA-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s38, -1
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
+; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    s_load_dwordx16 s[16:31], s[2:3], 0x10
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s36, s0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s37, s1
 ; GFX6-NOHSA-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
@@ -5595,10 +5615,10 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
 ; GFX6-NOHSA-LABEL: constant_load_v32i32:
 ; GFX6-NOHSA:       ; %bb.0:
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dwordx16 s[16:31], s[2:3], 0x10
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s38, -1
+; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    s_load_dwordx16 s[16:31], s[2:3], 0x10
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s36, s0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s37, s1
 ; GFX6-NOHSA-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 230232fe21524..bc666066a7acd 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefix=GFX6 %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck --check-prefix=GFX7 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG %s
@@ -166,17 +166,17 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x4
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s8
-; GFX6-NEXT:    v_mov_b32_e32 v1, s9
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
-; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0 offset:16
+; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -285,13 +285,12 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s7
+; GFX6-NEXT:    v_mov_b32_e32 v4, s0
+; GFX6-NEXT:    v_mov_b32_e32 v5, s1
+; GFX6-NEXT:    v_mov_b32_e32 v6, s2
+; GFX6-NEXT:    v_mov_b32_e32 v7, s3
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s2
-; GFX6-NEXT:    v_mov_b32_e32 v3, s3
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX6-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: constant_load_v4i64:
@@ -400,22 +399,21 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s15
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s8
-; GFX6-NEXT:    v_mov_b32_e32 v1, s9
-; GFX6-NEXT:    v_mov_b32_e32 v2, s10
-; GFX6-NEXT:    v_mov_b32_e32 v3, s11
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
-; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s7
+; GFX6-NEXT:    v_mov_b32_e32 v4, s8
+; GFX6-NEXT:    v_mov_b32_e32 v5, s9
+; GFX6-NEXT:    v_mov_b32_e32 v6, s10
+; GFX6-NEXT:    v_mov_b32_e32 v7, s11
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:32
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -563,10 +561,10 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
 ; GFX6-LABEL: constant_load_v16i64:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dwordx16 s[16:31], s[2:3], 0x10
 ; GFX6-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s38, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_load_dwordx16 s[16:31], s[2:3], 0x10
 ; GFX6-NEXT:    s_mov_b32 s36, s0
 ; GFX6-NEXT:    s_mov_b32 s37, s1
 ; GFX6-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index e632e50359749..14c1e04776561 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GFX6-NOHSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6-NOHSA %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GFX7-HSA %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8-NOHSA %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
@@ -210,11 +210,11 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s5, s4, 16
-; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
 ; GFX6-NOHSA-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
@@ -1088,13 +1088,12 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s5, s4, 0x80008
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s6, s4, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX6-NOHSA-NEXT:    s_and_b32 s6, s4, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NOHSA-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NOHSA-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:8
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
@@ -1191,13 +1190,12 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s5, s4, 0x80008
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s6, s4, 0x80010
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s6, s4
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s4, s4, 0x80010
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NOHSA-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NOHSA-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:8
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
@@ -1290,18 +1288,18 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX6-NOHSA:       ; %bb.0:
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX6-NOHSA-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s4, s2, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s5, s2, 0x80008
-; GFX6-NOHSA-NEXT:    s_and_b32 s6, s2, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s7, s2, 0x80010
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s4
+; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s5, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s6, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s7, s4, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
@@ -1397,18 +1395,18 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX6-NOHSA:       ; %bb.0:
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX6-NOHSA-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s4, s2, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s5, s2, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s6, s2, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s7, s2
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s7
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s6
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s4
+; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s5, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s6, s4, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s7, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
@@ -1511,13 +1509,13 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s6, s4, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s7, s4, 0x80008
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s8, s5, 24
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s9, s5, 0x80008
-; GFX6-NOHSA-NEXT:    s_and_b32 s10, s4, 0xff
 ; GFX6-NOHSA-NEXT:    s_and_b32 s11, s5, 0xff
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s6, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s7, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s10, s4, 0xff
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s11
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s9
@@ -1670,13 +1668,13 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s6, s4, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s7, s4, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s8, s4, 0x80008
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s9, s5, 24
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s10, s5, 0x80010
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s11, s5, 0x80008
 ; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s5, s5
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s6, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s7, s4, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s8, s4, 0x80008
 ; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s11
@@ -1833,32 +1831,32 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s8, s4, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s9, s4, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s10, s5, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s11, s5, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s12, s6, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s13, s6, 0x80008
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s14, s7, 24
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s15, s7, 0x80008
-; GFX6-NOHSA-NEXT:    s_and_b32 s16, s4, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s17, s5, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s18, s6, 0xff
 ; GFX6-NOHSA-NEXT:    s_and_b32 s19, s7, 0xff
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s7, s7, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s12, s6, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s13, s6, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s18, s6, 0xff
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s6, s6, 0x80010
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s19
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s14
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s10, s5, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s11, s5, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s17, s5, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s18
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s12
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s8, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s9, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s16, s4, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s17
@@ -2096,32 +2094,32 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s8, s4, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s9, s4, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s10, s4, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s11, s5, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s12, s5, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s13, s5, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s5, s5
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s14, s6, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s15, s6, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s16, s6, 0x80008
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s17, s7, 24
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s18, s7, 0x80010
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s19, s7, 0x80008
 ; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s7, s7
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s14, s6, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s15, s6, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s16, s6, 0x80008
 ; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s6, s6
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s19
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s17
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s11, s5, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s12, s5, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s13, s5, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s5, s5
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s15
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s14
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s8, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s9, s4, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s10, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s5
@@ -2367,72 +2365,72 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s12, s4, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s13, s4, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s14, s5, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s15, s5, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s16, s6, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s17, s6, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s18, s7, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s19, s7, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s20, s8, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s21, s8, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s22, s9, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s23, s9, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s24, s10, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s25, s10, 0x80008
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s26, s11, 24
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s27, s11, 0x80008
-; GFX6-NOHSA-NEXT:    s_and_b32 s28, s4, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s29, s5, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s30, s6, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s6, s6, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s31, s7, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s7, s7, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s33, s8, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s8, s8, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s34, s9, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s9, s9, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s35, s10, 0xff
 ; GFX6-NOHSA-NEXT:    s_and_b32 s36, s11, 0xff
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s11, s11, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s24, s10, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s25, s10, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s35, s10, 0xff
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s10, s10, 0x80010
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s36
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s27
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s26
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s22, s9, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s23, s9, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s34, s9, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s9, s9, 0x80010
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s35
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s25
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s24
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s20, s8, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s21, s8, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s33, s8, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s8, s8, 0x80010
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s34
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s23
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s22
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s18, s7, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s19, s7, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s31, s7, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s7, s7, 0x80010
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s33
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s21
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s20
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s16, s6, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s17, s6, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s30, s6, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s6, s6, 0x80010
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s31
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s19
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s18
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s14, s5, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s15, s5, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s29, s5, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s30
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s16
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s12, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s13, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s28, s4, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s29
@@ -2841,72 +2839,72 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s12, s4, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s13, s4, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s14, s4, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s15, s5, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s16, s5, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s17, s5, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s5, s5
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s18, s6, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s19, s6, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s20, s6, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s6, s6
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s21, s7, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s22, s7, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s23, s7, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s7, s7
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s24, s8, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s25, s8, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s26, s8, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s8, s8
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s27, s9, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s28, s9, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s29, s9, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s9, s9
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s30, s10, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s31, s10, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s33, s10, 0x80008
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s34, s11, 24
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s35, s11, 0x80010
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s36, s11, 0x80008
 ; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s11, s11
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s30, s10, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s31, s10, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s33, s10, 0x80008
 ; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s10, s10
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s11
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s36
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s35
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s34
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s27, s9, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s28, s9, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s29, s9, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s9, s9
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s33
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s31
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s30
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s24, s8, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s25, s8, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s26, s8, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s8, s8
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s9
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s29
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s28
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s27
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s21, s7, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s22, s7, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s23, s7, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s7, s7
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s26
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s25
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s24
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s18, s6, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s19, s6, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s20, s6, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s6, s6
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s23
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s22
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s21
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s15, s5, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s16, s5, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s17, s5, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s5, s5
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s20
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s19
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s18
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s12, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s13, s4, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s14, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s5
@@ -3335,100 +3333,84 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s18, s0, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s19, s0, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s20, s1, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s21, s1, 0x80008
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s22, s2, 24
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s23, s2, 0x80008
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s24, s3, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s27, s3, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s28, s4, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s29, s4, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s30, s5, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s31, s5, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s33, s6, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s34, s6, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s35, s7, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s36, s7, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s37, s8, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s38, s8, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s39, s9, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s40, s9, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s41, s10, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s42, s10, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s43, s11, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s44, s11, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s45, s12, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s46, s12, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s47, s13, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s48, s13, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s49, s14, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s50, s14, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s51, s15, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s52, s15, 0x80008
-; GFX6-NOHSA-NEXT:    s_and_b32 s26, s0, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s25, s0, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s53, s1, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s54, s1, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s25, s3, 0x80008
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s49, s15, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s50, s15, 0x80008
 ; GFX6-NOHSA-NEXT:    s_and_b32 s55, s2, 0xff
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s56, s2, 0x80010
 ; GFX6-NOHSA-NEXT:    s_and_b32 s57, s3, 0xff
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s58, s3, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s59, s4, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s60, s5, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s61, s6, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s6, s6, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s62, s7, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s7, s7, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s63, s8, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s8, s8, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s64, s9, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s9, s9, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s65, s10, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s66, s11, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s11, s11, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s67, s12, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s12, s12, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s68, s13, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s13, s13, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s69, s14, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s14, s14, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s70, s15, 0xff
+; GFX6-NOHSA-NEXT:    s_and_b32 s2, s11, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s3, s11, 0x80010
+; GFX6-NOHSA-NEXT:    s_and_b32 s68, s15, 0xff
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s15, s15, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s18, s0, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s19, s0, 0x80008
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s20, s1, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s21, s1, 0x80008
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s39, s10, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s40, s10, 0x80008
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s47, s14, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s48, s14, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s51, s0, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s52, s0, 0x80010
+; GFX6-NOHSA-NEXT:    s_and_b32 s53, s1, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s54, s1, 0x80010
+; GFX6-NOHSA-NEXT:    s_and_b32 s65, s10, 0xff
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s10, s10, 0x80010
+; GFX6-NOHSA-NEXT:    s_and_b32 s67, s14, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s14, s14, 0x80010
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s0, s16
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s1, s17
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s68
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s50
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s15
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s49
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s2
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v18, s3
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s70
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s52
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s15
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s51
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s69
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s50
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s37, s9, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s38, s9, 0x80008
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s41, s11, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s42, s11, 0x80008
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s43, s12, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s44, s12, 0x80008
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s45, s13, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s46, s13, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s64, s9, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s9, s9, 0x80010
+; GFX6-NOHSA-NEXT:    s_and_b32 s11, s12, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s12, s12, 0x80010
+; GFX6-NOHSA-NEXT:    s_and_b32 s66, s13, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s13, s13, 0x80010
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s67
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s48
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s14
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s49
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s68
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s48
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s13
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s47
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s67
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s46
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s12
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s45
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s66
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s44
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v18, s11
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s47
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s65
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s43
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s42
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s40
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s10
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s41
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s39
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s35, s8, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s36, s8, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s63, s8, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s8, s8, 0x80010
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s66
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s46
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s13
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s45
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s11
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s44
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s12
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s43
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s42
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s41
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
@@ -3436,43 +3418,59 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s64
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s40
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s38
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s9
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s39
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s37
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s33, s7, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s34, s7, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s62, s7, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s7, s7, 0x80010
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s63
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s38
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s36
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s8
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s37
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s35
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s30, s6, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s31, s6, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s61, s6, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s6, s6, 0x80010
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s62
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s36
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s34
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s35
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s33
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s28, s5, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s29, s5, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s60, s5, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s61
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s34
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s31
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s33
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s30
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s26, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s27, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s59, s4, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s60
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s29
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s30
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s28
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s59
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s29
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s27
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s28
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s26
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s57
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s27
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s25
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s58
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s24
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
@@ -3489,9 +3487,9 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s20
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s26
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s51
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s19
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s25
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s52
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s18
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
@@ -4235,14 +4233,6 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s18, s0, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s19, s0, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s20, s0, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s21, s0
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s22, s1, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s23, s1, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s24, s1, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s25, s1
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s26, s2, 24
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s27, s2, 0x80010
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s28, s2, 0x80008
@@ -4251,84 +4241,76 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s31, s3, 0x80010
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s33, s3, 0x80008
 ; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s34, s3
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s35, s4, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s36, s4, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s37, s4, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s38, s5, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s39, s5, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s40, s5, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s5, s5
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s41, s6, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s42, s6, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s43, s6, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s6, s6
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s44, s7, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s45, s7, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s46, s7, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s7, s7
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s47, s8, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s48, s8, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s49, s8, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s8, s8
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s50, s9, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s51, s9, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s52, s9, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s9, s9
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s2, s11, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s3, s11, 0x80008
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s66, s15, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s67, s15, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s68, s15, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s15, s15
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s18, s0, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s19, s0, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s20, s0, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s21, s0
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s22, s1, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s23, s1, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s24, s1, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s25, s1
 ; GFX6-NOHSA-NEXT:    s_ashr_i32 s53, s10, 24
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s54, s10, 0x80010
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s55, s10, 0x80008
 ; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s10, s10
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s56, s11, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s57, s11, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s58, s11
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s59, s12, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s60, s12, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s61, s12, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s12, s12
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s62, s13, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s63, s13, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s64, s13, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s13, s13
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s65, s14, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s66, s14, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s67, s14, 0x80008
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s63, s14, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s64, s14, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s65, s14, 0x80008
 ; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s14, s14
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s68, s15, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s69, s15, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s70, s15, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s15, s15
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s11, s11, 24
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s0, s16
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s1, s17
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s15
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s68
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s67
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s66
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s3
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v18, s2
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s15
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s70
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s69
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s68
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s50, s9, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s51, s9, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s52, s9, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s9, s9
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s56, s11, 24
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s11, s11
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s57, s12, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s58, s12, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s59, s12, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s12, s12
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s60, s13, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s61, s13, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s62, s13, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s13, s13
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s14
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s67
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s66
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s65
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s13
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s64
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s63
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s62
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s12
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s61
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s60
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s59
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s58
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s57
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v18, s56
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s65
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s64
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s63
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s11
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s55
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s54
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s53
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s47, s8, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s48, s8, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s49, s8, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s8, s8
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s13
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s62
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s61
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s60
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s12
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s59
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s58
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s57
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s11
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s56
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
@@ -4339,24 +4321,40 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s52
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s51
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s50
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s44, s7, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s45, s7, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s46, s7, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s7, s7
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s49
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s48
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s47
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s41, s6, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s42, s6, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s43, s6, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s6, s6
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s46
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s45
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s44
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s38, s5, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s39, s5, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s40, s5, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s5, s5
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s43
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s42
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s41
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s35, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s36, s4, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s37, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s5
@@ -5559,10 +5557,10 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NOHSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NOHSA-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NOHSA-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
@@ -5688,8 +5686,8 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX6-NOHSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NOHSA-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NOHSA-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX6-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX6-NOHSA-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX6-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX6-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
@@ -5807,24 +5805,26 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX6-NOHSA-LABEL: constant_zextload_v4i8_to_v4i64:
 ; GFX6-NOHSA:       ; %bb.0:
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dword s4, s[2:3], 0x0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
+; GFX6-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s5, s4, 0x80008
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s6, s4, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s7, s4, 0x80010
-; GFX6-NOHSA-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s7
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX6-NOHSA-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s0, s2, 0x80008
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s1, s2, 24
+; GFX6-NOHSA-NEXT:    s_and_b32 s3, s2, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s2, s2, 0x80010
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s1
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s3
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64:
@@ -5950,18 +5950,18 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s6, s4, 16
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s8, s4, 24
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s10, s4, 8
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x80000
 ; GFX6-NOHSA-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x80000
 ; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x80000
 ; GFX6-NOHSA-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s9
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s7
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s8
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s9
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -6097,20 +6097,20 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX6-NOHSA-LABEL: constant_zextload_v8i8_to_v8i64:
 ; GFX6-NOHSA:       ; %bb.0:
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s6, s4, 24
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s7, s5, 24
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s8, s5, 0x80008
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s9, s4, 0x80008
-; GFX6-NOHSA-NEXT:    s_and_b32 s10, s4, 0xff
 ; GFX6-NOHSA-NEXT:    s_and_b32 s11, s5, 0xff
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s6, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s9, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s10, s4, 0xff
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
@@ -6307,51 +6307,55 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
 define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GFX6-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64:
 ; GFX6-NOHSA:       ; %bb.0:
-; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GFX6-NOHSA-NEXT:    s_mov_b32 s9, 0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s11, s9
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT:    s_mov_b32 s7, 0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT:    s_mov_b32 s9, s7
-; GFX6-NOHSA-NEXT:    s_mov_b32 s11, s7
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s6, s5, 16
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s8, s5, 8
-; GFX6-NOHSA-NEXT:    s_mov_b32 s10, s5
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s14, s4, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s16, s4, 8
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[18:19], s[4:5], 0x80000
+; GFX6-NOHSA-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s0, s4
+; GFX6-NOHSA-NEXT:    s_mov_b32 s1, s5
+; GFX6-NOHSA-NEXT:    s_mov_b32 s5, s9
+; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s8, s7, 16
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s10, s7, 8
+; GFX6-NOHSA-NEXT:    s_mov_b32 s4, s7
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s12, s6, 16
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s14, s6, 24
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s16, s6, 8
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[18:19], s[6:7], 0x80000
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s20, s7, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s21, s7, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[8:9], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x80000
 ; GFX6-NOHSA-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s17, s5, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s20, s5, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[16:17], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s20
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s17
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s10
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s11
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s18
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s19
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s21
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s20
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x80000
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s8
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s9
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(1)
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s10
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s11
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s14
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s15
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s5
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s18
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s19
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s16
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s17
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64:
@@ -6576,35 +6580,35 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX6-NOHSA-LABEL: constant_zextload_v16i8_to_v16i64:
 ; GFX6-NOHSA:       ; %bb.0:
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s8, s5, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s9, s4, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s10, s7, 24
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s11, s6, 24
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s12, s6, 0x80008
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s13, s7, 0x80008
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s14, s4, 0x80008
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s15, s5, 0x80008
-; GFX6-NOHSA-NEXT:    s_and_b32 s16, s5, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s17, s4, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s18, s7, 0xff
 ; GFX6-NOHSA-NEXT:    s_and_b32 s19, s6, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s6, s6, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s10, s7, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s13, s7, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s18, s7, 0xff
 ; GFX6-NOHSA-NEXT:    s_bfe_u32 s7, s7, 0x80010
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s11
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s9, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s14, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s17, s4, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s10
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s8, s5, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s15, s5, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s16, s5, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
@@ -6939,89 +6943,94 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
 define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GFX6-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64:
 ; GFX6-NOHSA:       ; %bb.0:
-; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT:    s_mov_b32 s11, 0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
+; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[10:11], 0x0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s11, 0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s21, s11
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s13, s11
-; GFX6-NOHSA-NEXT:    s_mov_b32 s15, s11
-; GFX6-NOHSA-NEXT:    s_mov_b32 s17, s11
+; GFX6-NOHSA-NEXT:    s_mov_b32 s0, s8
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s10, s7, 16
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s20, s5, 16
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s12, s7, 8
-; GFX6-NOHSA-NEXT:    s_mov_b32 s18, s7
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s20, s6, 16
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s22, s6, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s24, s6, 8
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s14, s5, 16
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s16, s5, 8
-; GFX6-NOHSA-NEXT:    s_mov_b32 s26, s5
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s28, s4, 16
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s30, s4, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s34, s4, 8
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[8:9], s[4:5], 0x80000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s8, s7
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s14, s6, 16
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s16, s6, 24
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s18, s6, 8
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s22, s5, 8
+; GFX6-NOHSA-NEXT:    s_mov_b32 s24, s5
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s26, s4, 16
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s28, s4, 24
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s30, s4, 8
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[34:35], s[4:5], 0x80000
 ; GFX6-NOHSA-NEXT:    s_bfe_i64 s[36:37], s[6:7], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s31, s5, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s33, s5, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s35, s7, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s38, s7, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[34:35], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[30:31], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s33, s5, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s38, s5, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[20:21], 0x80000
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s20, s7, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s21, s7, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[10:11], 0x80000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s1, s9
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s21
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s20
 ; GFX6-NOHSA-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x80000
 ; GFX6-NOHSA-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s38
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s35
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s18
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s19
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s36
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s37
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s33
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s31
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s26
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s27
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s9
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s12
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s13
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s20
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s21
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s12
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s13
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s15
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s16
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s17
+; GFX6-NOHSA-NEXT:    s_mov_b32 s23, s11
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s36
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s37
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s19
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s38
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s33
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s22
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s23
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s24
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s25
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s24
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s25
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s22
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s23
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s14
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s15
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s16
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s17
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s26
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s27
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s28
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s29
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s5
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s34
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s35
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s30
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s31
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
@@ -7417,109 +7426,109 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GFX6-NOHSA-LABEL: constant_zextload_v32i8_to_v32i64:
 ; GFX6-NOHSA:       ; %bb.0:
-; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s11, 0xf000
+; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s12, s0, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s13, s1, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s14, s2, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s15, s3, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s16, s4, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s17, s5, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s18, s6, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s19, s7, 24
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s20, s7, 0x80008
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s21, s6, 0x80008
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s22, s5, 0x80008
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s23, s4, 0x80008
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s24, s3, 0x80008
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s25, s2, 0x80008
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s26, s1, 0x80008
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s27, s0, 0x80008
-; GFX6-NOHSA-NEXT:    s_and_b32 s28, s0, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s29, s1, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s30, s2, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s31, s3, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s33, s4, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s34, s5, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s35, s6, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s36, s7, 0xff
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s0, s0, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s1, s1, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s2, s2, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s3, s3, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s7, s7, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_u32 s6, s6, 0x80010
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s7
+; GFX6-NOHSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
+; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s19, s11, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s20, s11, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s36, s11, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s11, s11, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s18, s10, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s21, s10, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s35, s10, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s10, s10, 0x80010
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s11
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s19
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:240
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s17, s9, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s22, s9, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s34, s9, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s9, s9, 0x80010
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s18
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:208
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s16, s8, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s23, s8, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s33, s8, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s8, s8, 0x80010
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s9
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s17
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:176
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s15, s7, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s24, s7, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s31, s7, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s7, s7, 0x80010
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s16
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:144
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s14, s6, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s25, s6, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s30, s6, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s6, s6, 0x80010
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s3
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s15
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s13, s5, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s26, s5, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s29, s5, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s14
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s12, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s27, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_and_b32 s28, s4, 0xff
+; GFX6-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s13
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s12
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s36
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s20
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:224
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s35
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s21
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:192
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s34
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s22
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:160
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s33
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s23
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:128
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s31
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s24
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s30
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s25
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s29
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s26
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s28
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s27
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64:
@@ -8097,169 +8106,171 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX6-NOHSA-LABEL: constant_sextload_v32i8_to_v32i64:
 ; GFX6-NOHSA:       ; %bb.0:
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX6-NOHSA-NEXT:    s_mov_b32 s57, 0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s47, s57
+; GFX6-NOHSA-NEXT:    s_mov_b32 s19, s57
+; GFX6-NOHSA-NEXT:    s_mov_b32 s15, s57
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX6-NOHSA-NEXT:    s_mov_b32 s21, s57
+; GFX6-NOHSA-NEXT:    s_mov_b32 s23, s57
+; GFX6-NOHSA-NEXT:    s_mov_b32 s39, s57
+; GFX6-NOHSA-NEXT:    s_mov_b32 s41, s57
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s24, s7, 16
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s12, s7, 8
-; GFX6-NOHSA-NEXT:    s_mov_b32 s26, s7
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s22, s6, 16
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s16, s6, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s20, s6, 8
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s56, s7, 16
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s46, s7, 8
+; GFX6-NOHSA-NEXT:    s_mov_b32 s60, s7
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s50, s6, 16
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s36, s6, 24
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s24, s6, 8
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s33, s3, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s43, s3, 24
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s31, s7, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s35, s7, 24
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s20, s3, 16
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s22, s3, 8
+; GFX6-NOHSA-NEXT:    s_mov_b32 s26, s3
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s28, s2, 16
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s30, s2, 24
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s34, s2, 8
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[12:13], s[2:3], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[2:3], s[6:7], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[56:57], 0x80000
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s7
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[60:61], 0x80000
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[46:47], 0x80000
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s25, s1, 24
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s7
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[50:51], 0x80000
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s18, s5, 16
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s14, s5, 8
-; GFX6-NOHSA-NEXT:    s_mov_b32 s10, s5
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s27, s1, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s33, s1, 24
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s17, s3, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s13, s3, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s21, s5, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s15, s5, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[58:59], s[26:27], 0x80000
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s19, s7, 31
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s23, s7, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s42, s4, 16
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s38, s4, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s26, s4, 8
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s28, s3, 16
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s30, s3, 8
-; GFX6-NOHSA-NEXT:    s_mov_b32 s52, s3
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s34, s2, 16
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s36, s2, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s40, s2, 8
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s44, s1, 16
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s46, s1, 8
-; GFX6-NOHSA-NEXT:    s_mov_b32 s56, s1
+; GFX6-NOHSA-NEXT:    s_mov_b32 s16, s5
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s17, s1, 31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s7
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[36:37], 0x80000
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s2
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s3
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[2:3], s[24:25], 0x80000
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s62, s4, 16
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s58, s4, 24
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s38, s1, 16
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s40, s1, 8
+; GFX6-NOHSA-NEXT:    s_mov_b32 s44, s1
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s48, s0, 16
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s50, s0, 24
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s52, s0, 24
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s54, s0, 8
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[10:11], s[0:1], 0x80000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s0, s8
+; GFX6-NOHSA-NEXT:    s_mov_b32 s1, s9
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s2
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s3
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[2:3], s[18:19], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[16:17], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[8:9], s[14:15], 0x80000
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s42, s4, 8
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s2
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s3
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s35
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s31
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v20, s6
+; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v21, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v22, s8
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v23, s9
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[62:63], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[8:9], s[58:59], 0x80000
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s27, s5, 31
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s29, s5, 24
 ; GFX6-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x80000
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s58
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s59
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s6
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s7
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s10
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s11
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v18, s23
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s19
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s15
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v14, s13
-; GFX6-NOHSA-NEXT:    s_mov_b32 s25, 0
-; GFX6-NOHSA-NEXT:    s_mov_b32 s11, 0xf000
-; GFX6-NOHSA-NEXT:    s_mov_b32 s13, s25
-; GFX6-NOHSA-NEXT:    s_mov_b32 s19, s25
-; GFX6-NOHSA-NEXT:    s_mov_b32 s15, s25
-; GFX6-NOHSA-NEXT:    s_mov_b32 s29, s25
-; GFX6-NOHSA-NEXT:    s_mov_b32 s31, s25
-; GFX6-NOHSA-NEXT:    s_mov_b32 s45, s25
-; GFX6-NOHSA-NEXT:    s_mov_b32 s47, s25
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[24:25], 0x80000
-; GFX6-NOHSA-NEXT:    s_mov_b32 s10, -1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s21
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v15, s17
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s5
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[8:11], 0 offset:240
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[24:25], s[56:57], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[52:53], s[52:53], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[56:57], s[22:23], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[58:59], s[12:13], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[60:61], s[20:21], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[62:63], s[16:17], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[64:65], s[18:19], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[66:67], s[14:15], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[54:55], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[50:51], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[12:13], s[48:49], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[14:15], s[46:47], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[16:17], s[44:45], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[18:19], s[40:41], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[20:21], s[36:37], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[22:23], s[34:35], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GFX6-NOHSA-NEXT:    s_bfe_i64 s[34:35], s[26:27], 0x80000
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s58
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s59
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:224
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s56
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s57
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s62
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s63
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:208
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v16, s52
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v17, s53
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s60
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s61
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:192
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s9
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[42:43], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s27
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v8, s2
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v9, s3
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s64
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s65
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:176
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s4
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s5
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s7
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(1)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s20
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s21
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s43
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s33
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v18, s29
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v6, s33
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v7, s27
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s66
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s67
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:160
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s26
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s27
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s22
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s23
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x80000
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s42
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s43
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s38
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s39
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:144
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v20, s24
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v21, s25
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s28
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s29
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s30
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s31
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[36:37], s[38:39], 0x80000
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s12
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s34
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s35
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:128
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[14:15], s[44:45], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[18:19], s[40:41], 0x80000
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v12, s28
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v13, s29
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:112
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v18, s30
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v19, s31
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[8:11], 0 offset:96
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s22
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s23
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s20
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s21
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:80
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v10, s18
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v11, s19
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:64
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(1)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s16
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s17
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:48
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v22, s14
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v23, s15
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[8:11], 0 offset:32
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s12
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s13
-; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(1)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:16
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s36
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s37
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s25
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s17
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[52:53], 0x80000
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[8:9], s[48:49], 0x80000
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s15
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s19
+; GFX6-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[54:55], 0x80000
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s9
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s7
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
+; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64:
@@ -9667,15 +9678,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s5, s4, 24
-; GFX6-NOHSA-NEXT:    s_and_b32 s2, s4, 0xff00
-; GFX6-NOHSA-NEXT:    s_and_b32 s6, s4, 0xff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX6-NOHSA-NEXT:    s_and_b32 s6, s4, 0xff00
+; GFX6-NOHSA-NEXT:    s_and_b32 s7, s4, 0xff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s6, s6, 8
 ; GFX6-NOHSA-NEXT:    s_lshr_b64 s[4:5], s[4:5], 16
-; GFX6-NOHSA-NEXT:    s_or_b32 s5, s6, s2
+; GFX6-NOHSA-NEXT:    s_or_b32 s5, s7, s6
 ; GFX6-NOHSA-NEXT:    s_and_b32 s4, s4, 0xff00ff
-; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -9799,22 +9810,22 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX6-NOHSA:       ; %bb.0:
 ; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX6-NOHSA-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s4, s2, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s5, s2, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s6, s2, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s2, s2
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX6-NOHSA-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NOHSA-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX6-NOHSA-NEXT:    s_or_b32 s4, s5, s4
-; GFX6-NOHSA-NEXT:    s_or_b32 s5, s2, s6
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s5
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s5, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s6, s4, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s7, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s6, s6, 0xffff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s4, s4, 0xffff
+; GFX6-NOHSA-NEXT:    s_or_b32 s5, s6, s5
+; GFX6-NOHSA-NEXT:    s_or_b32 s4, s4, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
 ;
@@ -9954,27 +9965,27 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s2, s4, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s7, s5, 24
-; GFX6-NOHSA-NEXT:    s_and_b32 s8, s4, 0xff00
 ; GFX6-NOHSA-NEXT:    s_and_b32 s9, s5, 0xff00
+; GFX6-NOHSA-NEXT:    s_and_b32 s6, s4, 0xff00
 ; GFX6-NOHSA-NEXT:    s_and_b32 s10, s5, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s11, s4, 0xff
-; GFX6-NOHSA-NEXT:    s_mov_b32 s6, s5
 ; GFX6-NOHSA-NEXT:    s_lshl_b32 s9, s9, 8
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s8, s4, 24
+; GFX6-NOHSA-NEXT:    s_or_b32 s9, s10, s9
+; GFX6-NOHSA-NEXT:    s_and_b32 s10, s4, 0xff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s7, s5, 24
+; GFX6-NOHSA-NEXT:    s_or_b32 s10, s10, s6
+; GFX6-NOHSA-NEXT:    s_mov_b32 s6, s5
+; GFX6-NOHSA-NEXT:    s_mov_b32 s5, s8
 ; GFX6-NOHSA-NEXT:    s_lshr_b64 s[6:7], s[6:7], 16
-; GFX6-NOHSA-NEXT:    s_mov_b32 s5, s2
 ; GFX6-NOHSA-NEXT:    s_lshr_b64 s[4:5], s[4:5], 16
-; GFX6-NOHSA-NEXT:    s_or_b32 s5, s10, s9
-; GFX6-NOHSA-NEXT:    s_or_b32 s7, s11, s8
 ; GFX6-NOHSA-NEXT:    s_and_b32 s6, s6, 0xff00ff
 ; GFX6-NOHSA-NEXT:    s_and_b32 s4, s4, 0xff00ff
-; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
@@ -10158,28 +10169,28 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s2, s5, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s6, s5, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s7, s5, 0x80008
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s6, s5, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s7, s5, 0x80010
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s8, s5, 0x80008
 ; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s5, s5
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s8, s4, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s9, s4, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s10, s4, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NOHSA-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX6-NOHSA-NEXT:    s_or_b32 s6, s7, s6
+; GFX6-NOHSA-NEXT:    s_or_b32 s5, s5, s8
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s7, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s8, s4, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s8, s8, 0xffff
+; GFX6-NOHSA-NEXT:    s_or_b32 s7, s8, s7
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s8, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
 ; GFX6-NOHSA-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX6-NOHSA-NEXT:    s_and_b32 s9, s9, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s10, s10, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX6-NOHSA-NEXT:    s_or_b32 s6, s6, s2
-; GFX6-NOHSA-NEXT:    s_or_b32 s5, s5, s7
-; GFX6-NOHSA-NEXT:    s_or_b32 s7, s9, s8
-; GFX6-NOHSA-NEXT:    s_or_b32 s4, s4, s10
-; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
+; GFX6-NOHSA-NEXT:    s_or_b32 s4, s4, s8
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
@@ -10397,47 +10408,47 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s12, s6, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s9, s7, 24
-; GFX6-NOHSA-NEXT:    s_and_b32 s13, s6, 0xff00
-; GFX6-NOHSA-NEXT:    s_and_b32 s14, s7, 0xff00
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s15, s4, 24
+; GFX6-NOHSA-NEXT:    s_and_b32 s15, s5, 0xff00
+; GFX6-NOHSA-NEXT:    s_and_b32 s10, s4, 0xff00
+; GFX6-NOHSA-NEXT:    s_and_b32 s16, s5, 0xff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s15, s15, 8
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s14, s4, 24
+; GFX6-NOHSA-NEXT:    s_or_b32 s15, s16, s15
+; GFX6-NOHSA-NEXT:    s_and_b32 s16, s4, 0xff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s10, s10, 8
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s11, s5, 24
-; GFX6-NOHSA-NEXT:    s_and_b32 s16, s4, 0xff00
-; GFX6-NOHSA-NEXT:    s_and_b32 s17, s5, 0xff00
-; GFX6-NOHSA-NEXT:    s_and_b32 s18, s5, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s19, s4, 0xff
+; GFX6-NOHSA-NEXT:    s_or_b32 s16, s16, s10
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s10, s5
-; GFX6-NOHSA-NEXT:    s_and_b32 s20, s7, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s21, s6, 0xff
-; GFX6-NOHSA-NEXT:    s_mov_b32 s8, s7
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s17, s17, 8
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s16, s16, 8
+; GFX6-NOHSA-NEXT:    s_mov_b32 s5, s14
+; GFX6-NOHSA-NEXT:    s_and_b32 s13, s7, 0xff00
 ; GFX6-NOHSA-NEXT:    s_lshr_b64 s[10:11], s[10:11], 16
-; GFX6-NOHSA-NEXT:    s_mov_b32 s5, s15
 ; GFX6-NOHSA-NEXT:    s_lshr_b64 s[4:5], s[4:5], 16
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s5, s14, 8
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s11, s13, 8
-; GFX6-NOHSA-NEXT:    s_lshr_b64 s[8:9], s[8:9], 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s8, s6, 0xff00
+; GFX6-NOHSA-NEXT:    s_and_b32 s11, s4, 0xff00ff
+; GFX6-NOHSA-NEXT:    s_and_b32 s4, s7, 0xff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s5, s13, 8
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s12, s6, 24
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s9, s7, 24
+; GFX6-NOHSA-NEXT:    s_or_b32 s13, s4, s5
+; GFX6-NOHSA-NEXT:    s_and_b32 s4, s6, 0xff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s5, s8, 8
+; GFX6-NOHSA-NEXT:    s_mov_b32 s8, s7
+; GFX6-NOHSA-NEXT:    s_or_b32 s14, s4, s5
+; GFX6-NOHSA-NEXT:    s_lshr_b64 s[4:5], s[8:9], 16
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s7, s12
-; GFX6-NOHSA-NEXT:    s_lshr_b64 s[6:7], s[6:7], 16
-; GFX6-NOHSA-NEXT:    s_or_b32 s7, s18, s17
-; GFX6-NOHSA-NEXT:    s_or_b32 s9, s19, s16
-; GFX6-NOHSA-NEXT:    s_and_b32 s10, s10, 0xff00ff
-; GFX6-NOHSA-NEXT:    s_or_b32 s5, s20, s5
-; GFX6-NOHSA-NEXT:    s_or_b32 s11, s21, s11
-; GFX6-NOHSA-NEXT:    s_and_b32 s8, s8, 0xff00ff
-; GFX6-NOHSA-NEXT:    s_and_b32 s6, s6, 0xff00ff
+; GFX6-NOHSA-NEXT:    s_and_b32 s8, s4, 0xff00ff
+; GFX6-NOHSA-NEXT:    s_lshr_b64 s[4:5], s[6:7], 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s4, s4, 0xff00ff
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s11
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s6
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT:    s_and_b32 s10, s10, 0xff00ff
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s13
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s9
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s16
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s11
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s15
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
@@ -10757,42 +10768,42 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s9, s5, 0x80010
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s10, s5, 0x80008
 ; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s5, s5
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s11, s4, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s12, s4, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s13, s4, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s14, s7, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s15, s7, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s16, s7, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s7, s7
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s17, s6, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s18, s6, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s19, s6, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s6, s6
 ; GFX6-NOHSA-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s9, s9, 0xffff
 ; GFX6-NOHSA-NEXT:    s_lshl_b32 s10, s10, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s11, s11, 16
-; GFX6-NOHSA-NEXT:    s_and_b32 s12, s12, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s13, s13, 16
+; GFX6-NOHSA-NEXT:    s_or_b32 s8, s9, s8
+; GFX6-NOHSA-NEXT:    s_or_b32 s5, s5, s10
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s9, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s10, s4, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s9, s9, 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s10, s10, 0xffff
+; GFX6-NOHSA-NEXT:    s_or_b32 s9, s10, s9
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s10, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s10, s10, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s14, s14, 16
-; GFX6-NOHSA-NEXT:    s_and_b32 s15, s15, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s16, s16, 16
+; GFX6-NOHSA-NEXT:    s_or_b32 s4, s4, s10
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s10, s7, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s11, s7, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s10, s10, 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s11, s11, 0xffff
+; GFX6-NOHSA-NEXT:    s_or_b32 s10, s11, s10
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s11, s7, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s7, s7
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s11, s11, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s17, s17, 16
-; GFX6-NOHSA-NEXT:    s_and_b32 s18, s18, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s19, s19, 16
+; GFX6-NOHSA-NEXT:    s_or_b32 s7, s7, s11
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s11, s6, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s12, s6, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s11, s11, 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s12, s12, 0xffff
+; GFX6-NOHSA-NEXT:    s_or_b32 s11, s12, s11
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s12, s6, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s6, s6
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s12, s12, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX6-NOHSA-NEXT:    s_or_b32 s8, s9, s8
-; GFX6-NOHSA-NEXT:    s_or_b32 s5, s5, s10
-; GFX6-NOHSA-NEXT:    s_or_b32 s9, s12, s11
-; GFX6-NOHSA-NEXT:    s_or_b32 s10, s15, s14
-; GFX6-NOHSA-NEXT:    s_or_b32 s7, s7, s16
-; GFX6-NOHSA-NEXT:    s_or_b32 s11, s18, s17
-; GFX6-NOHSA-NEXT:    s_or_b32 s6, s6, s19
-; GFX6-NOHSA-NEXT:    s_or_b32 s4, s4, s13
+; GFX6-NOHSA-NEXT:    s_or_b32 s6, s6, s12
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
@@ -11178,72 +11189,71 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s20, s6, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s13, s7, 24
-; GFX6-NOHSA-NEXT:    s_and_b32 s21, s6, 0xff00
-; GFX6-NOHSA-NEXT:    s_and_b32 s22, s7, 0xff00
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s23, s4, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s15, s5, 24
-; GFX6-NOHSA-NEXT:    s_and_b32 s24, s4, 0xff00
-; GFX6-NOHSA-NEXT:    s_and_b32 s25, s5, 0xff00
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s26, s2, 24
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s17, s3, 24
-; GFX6-NOHSA-NEXT:    s_and_b32 s27, s2, 0xff00
-; GFX6-NOHSA-NEXT:    s_and_b32 s28, s3, 0xff00
-; GFX6-NOHSA-NEXT:    s_lshr_b32 s29, s0, 24
+; GFX6-NOHSA-NEXT:    s_and_b32 s27, s1, 0xff00
+; GFX6-NOHSA-NEXT:    s_and_b32 s18, s0, 0xff00
+; GFX6-NOHSA-NEXT:    s_and_b32 s28, s1, 0xff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s27, s27, 8
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s26, s0, 24
+; GFX6-NOHSA-NEXT:    s_or_b32 s27, s28, s27
+; GFX6-NOHSA-NEXT:    s_and_b32 s28, s0, 0xff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s18, s18, 8
 ; GFX6-NOHSA-NEXT:    s_lshr_b32 s19, s1, 24
-; GFX6-NOHSA-NEXT:    s_and_b32 s30, s0, 0xff00
-; GFX6-NOHSA-NEXT:    s_and_b32 s31, s1, 0xff00
-; GFX6-NOHSA-NEXT:    s_and_b32 s33, s1, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s34, s0, 0xff
+; GFX6-NOHSA-NEXT:    s_or_b32 s28, s28, s18
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s18, s1
-; GFX6-NOHSA-NEXT:    s_and_b32 s35, s3, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s36, s2, 0xff
+; GFX6-NOHSA-NEXT:    s_mov_b32 s1, s26
+; GFX6-NOHSA-NEXT:    s_and_b32 s25, s3, 0xff00
+; GFX6-NOHSA-NEXT:    s_lshr_b64 s[18:19], s[18:19], 16
+; GFX6-NOHSA-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s16, s2, 0xff00
+; GFX6-NOHSA-NEXT:    s_and_b32 s19, s0, 0xff00ff
+; GFX6-NOHSA-NEXT:    s_and_b32 s0, s3, 0xff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s1, s25, 8
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s24, s2, 24
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s17, s3, 24
+; GFX6-NOHSA-NEXT:    s_or_b32 s25, s0, s1
+; GFX6-NOHSA-NEXT:    s_and_b32 s0, s2, 0xff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s1, s16, 8
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s16, s3
-; GFX6-NOHSA-NEXT:    s_and_b32 s37, s5, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s38, s4, 0xff
+; GFX6-NOHSA-NEXT:    s_or_b32 s26, s0, s1
+; GFX6-NOHSA-NEXT:    s_lshr_b64 s[0:1], s[16:17], 16
+; GFX6-NOHSA-NEXT:    s_mov_b32 s3, s24
+; GFX6-NOHSA-NEXT:    s_and_b32 s23, s5, 0xff00
+; GFX6-NOHSA-NEXT:    s_and_b32 s16, s0, 0xff00ff
+; GFX6-NOHSA-NEXT:    s_lshr_b64 s[0:1], s[2:3], 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s14, s4, 0xff00
+; GFX6-NOHSA-NEXT:    s_and_b32 s2, s0, 0xff00ff
+; GFX6-NOHSA-NEXT:    s_and_b32 s0, s5, 0xff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s1, s23, 8
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s22, s4, 24
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s15, s5, 24
+; GFX6-NOHSA-NEXT:    s_or_b32 s3, s0, s1
+; GFX6-NOHSA-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s1, s14, 8
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s14, s5
-; GFX6-NOHSA-NEXT:    s_and_b32 s39, s7, 0xff
-; GFX6-NOHSA-NEXT:    s_and_b32 s40, s6, 0xff
+; GFX6-NOHSA-NEXT:    s_or_b32 s17, s0, s1
+; GFX6-NOHSA-NEXT:    s_lshr_b64 s[0:1], s[14:15], 16
+; GFX6-NOHSA-NEXT:    s_mov_b32 s5, s22
+; GFX6-NOHSA-NEXT:    s_and_b32 s21, s7, 0xff00
+; GFX6-NOHSA-NEXT:    s_and_b32 s14, s0, 0xff00ff
+; GFX6-NOHSA-NEXT:    s_lshr_b64 s[0:1], s[4:5], 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s12, s6, 0xff00
+; GFX6-NOHSA-NEXT:    s_and_b32 s4, s0, 0xff00ff
+; GFX6-NOHSA-NEXT:    s_and_b32 s0, s7, 0xff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s1, s21, 8
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s20, s6, 24
+; GFX6-NOHSA-NEXT:    s_lshr_b32 s13, s7, 24
+; GFX6-NOHSA-NEXT:    s_or_b32 s5, s0, s1
+; GFX6-NOHSA-NEXT:    s_and_b32 s0, s6, 0xff
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s1, s12, 8
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s12, s7
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s31, s31, 8
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s30, s30, 8
-; GFX6-NOHSA-NEXT:    s_lshr_b64 s[18:19], s[18:19], 16
-; GFX6-NOHSA-NEXT:    s_mov_b32 s1, s29
-; GFX6-NOHSA-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s1, s28, 8
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s19, s27, 8
-; GFX6-NOHSA-NEXT:    s_lshr_b64 s[16:17], s[16:17], 16
-; GFX6-NOHSA-NEXT:    s_mov_b32 s3, s26
-; GFX6-NOHSA-NEXT:    s_lshr_b64 s[2:3], s[2:3], 16
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s3, s25, 8
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s17, s24, 8
-; GFX6-NOHSA-NEXT:    s_lshr_b64 s[14:15], s[14:15], 16
-; GFX6-NOHSA-NEXT:    s_mov_b32 s5, s23
-; GFX6-NOHSA-NEXT:    s_lshr_b64 s[4:5], s[4:5], 16
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s5, s22, 8
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s15, s21, 8
-; GFX6-NOHSA-NEXT:    s_lshr_b64 s[12:13], s[12:13], 16
+; GFX6-NOHSA-NEXT:    s_or_b32 s15, s0, s1
+; GFX6-NOHSA-NEXT:    s_lshr_b64 s[0:1], s[12:13], 16
 ; GFX6-NOHSA-NEXT:    s_mov_b32 s7, s20
-; GFX6-NOHSA-NEXT:    s_lshr_b64 s[6:7], s[6:7], 16
-; GFX6-NOHSA-NEXT:    s_or_b32 s7, s33, s31
-; GFX6-NOHSA-NEXT:    s_or_b32 s13, s34, s30
-; GFX6-NOHSA-NEXT:    s_and_b32 s18, s18, 0xff00ff
+; GFX6-NOHSA-NEXT:    s_and_b32 s12, s0, 0xff00ff
+; GFX6-NOHSA-NEXT:    s_lshr_b64 s[0:1], s[6:7], 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s0, s0, 0xff00ff
-; GFX6-NOHSA-NEXT:    s_or_b32 s1, s35, s1
-; GFX6-NOHSA-NEXT:    s_or_b32 s19, s36, s19
-; GFX6-NOHSA-NEXT:    s_and_b32 s16, s16, 0xff00ff
-; GFX6-NOHSA-NEXT:    s_and_b32 s2, s2, 0xff00ff
-; GFX6-NOHSA-NEXT:    s_or_b32 s3, s37, s3
-; GFX6-NOHSA-NEXT:    s_or_b32 s17, s38, s17
-; GFX6-NOHSA-NEXT:    s_and_b32 s14, s14, 0xff00ff
-; GFX6-NOHSA-NEXT:    s_or_b32 s5, s39, s5
-; GFX6-NOHSA-NEXT:    s_or_b32 s15, s40, s15
-; GFX6-NOHSA-NEXT:    s_and_b32 s12, s12, 0xff00ff
-; GFX6-NOHSA-NEXT:    s_and_b32 s6, s6, 0xff00ff
-; GFX6-NOHSA-NEXT:    s_and_b32 s4, s4, 0xff00ff
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s15
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s12
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
@@ -11252,17 +11262,18 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s14
+; GFX6-NOHSA-NEXT:    s_and_b32 s18, s18, 0xff00ff
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s19
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s26
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s2
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s1
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s25
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s16
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
 ; GFX6-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s13
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s28
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s19
+; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s27
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v3, s18
 ; GFX6-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX6-NOHSA-NEXT:    s_endpgm
@@ -11838,82 +11849,82 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s13, s1, 0x80010
 ; GFX6-NOHSA-NEXT:    s_bfe_i32 s14, s1, 0x80008
 ; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s1, s1
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s15, s0, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s16, s0, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s17, s0, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s0, s0
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s18, s3, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s19, s3, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s20, s3, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s3, s3
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s21, s2, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s22, s2, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s23, s2, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s2, s2
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s24, s5, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s25, s5, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s26, s5, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s5, s5
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s27, s4, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s28, s4, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s29, s4, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s30, s7, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s31, s7, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s33, s7, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s7, s7
-; GFX6-NOHSA-NEXT:    s_ashr_i32 s34, s6, 24
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s35, s6, 0x80010
-; GFX6-NOHSA-NEXT:    s_bfe_i32 s36, s6, 0x80008
-; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s6, s6
 ; GFX6-NOHSA-NEXT:    s_lshl_b32 s12, s12, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s13, s13, 0xffff
 ; GFX6-NOHSA-NEXT:    s_lshl_b32 s14, s14, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s15, s15, 16
-; GFX6-NOHSA-NEXT:    s_and_b32 s16, s16, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s17, s17, 16
+; GFX6-NOHSA-NEXT:    s_or_b32 s12, s13, s12
+; GFX6-NOHSA-NEXT:    s_or_b32 s1, s1, s14
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s13, s0, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s14, s0, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s13, s13, 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s14, s14, 0xffff
+; GFX6-NOHSA-NEXT:    s_or_b32 s13, s14, s13
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s14, s0, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s0, s0
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s14, s14, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s18, s18, 16
-; GFX6-NOHSA-NEXT:    s_and_b32 s19, s19, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s20, s20, 16
+; GFX6-NOHSA-NEXT:    s_or_b32 s0, s0, s14
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s14, s3, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s15, s3, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s14, s14, 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s15, s15, 0xffff
+; GFX6-NOHSA-NEXT:    s_or_b32 s14, s15, s14
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s15, s3, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s3, s3
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s15, s15, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s21, s21, 16
-; GFX6-NOHSA-NEXT:    s_and_b32 s22, s22, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s23, s23, 16
+; GFX6-NOHSA-NEXT:    s_or_b32 s3, s3, s15
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s15, s2, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s16, s2, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s15, s15, 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s16, s16, 0xffff
+; GFX6-NOHSA-NEXT:    s_or_b32 s15, s16, s15
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s16, s2, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s2, s2
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s16, s16, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s24, s24, 16
-; GFX6-NOHSA-NEXT:    s_and_b32 s25, s25, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s26, s26, 16
+; GFX6-NOHSA-NEXT:    s_or_b32 s2, s2, s16
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s16, s5, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s17, s5, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s16, s16, 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s17, s17, 0xffff
+; GFX6-NOHSA-NEXT:    s_or_b32 s16, s17, s16
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s17, s5, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s5, s5
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s17, s17, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s27, s27, 16
-; GFX6-NOHSA-NEXT:    s_and_b32 s28, s28, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s29, s29, 16
+; GFX6-NOHSA-NEXT:    s_or_b32 s5, s5, s17
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s17, s4, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s18, s4, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s17, s17, 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s18, s18, 0xffff
+; GFX6-NOHSA-NEXT:    s_or_b32 s17, s18, s17
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s18, s4, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s18, s18, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s30, s30, 16
-; GFX6-NOHSA-NEXT:    s_and_b32 s31, s31, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s33, s33, 16
+; GFX6-NOHSA-NEXT:    s_or_b32 s4, s4, s18
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s18, s7, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s19, s7, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s18, s18, 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s19, s19, 0xffff
+; GFX6-NOHSA-NEXT:    s_or_b32 s18, s19, s18
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s19, s7, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s7, s7
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s19, s19, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s34, s34, 16
-; GFX6-NOHSA-NEXT:    s_and_b32 s35, s35, 0xffff
-; GFX6-NOHSA-NEXT:    s_lshl_b32 s36, s36, 16
+; GFX6-NOHSA-NEXT:    s_or_b32 s7, s7, s19
+; GFX6-NOHSA-NEXT:    s_ashr_i32 s19, s6, 24
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s20, s6, 0x80010
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s19, s19, 16
+; GFX6-NOHSA-NEXT:    s_and_b32 s20, s20, 0xffff
+; GFX6-NOHSA-NEXT:    s_or_b32 s19, s20, s19
+; GFX6-NOHSA-NEXT:    s_bfe_i32 s20, s6, 0x80008
+; GFX6-NOHSA-NEXT:    s_sext_i32_i8 s6, s6
+; GFX6-NOHSA-NEXT:    s_lshl_b32 s20, s20, 16
 ; GFX6-NOHSA-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX6-NOHSA-NEXT:    s_or_b32 s12, s13, s12
-; GFX6-NOHSA-NEXT:    s_or_b32 s1, s1, s14
-; GFX6-NOHSA-NEXT:    s_or_b32 s13, s16, s15
-; GFX6-NOHSA-NEXT:    s_or_b32 s0, s0, s17
-; GFX6-NOHSA-NEXT:    s_or_b32 s14, s19, s18
-; GFX6-NOHSA-NEXT:    s_or_b32 s3, s3, s20
-; GFX6-NOHSA-NEXT:    s_or_b32 s15, s22, s21
-; GFX6-NOHSA-NEXT:    s_or_b32 s2, s2, s23
-; GFX6-NOHSA-NEXT:    s_or_b32 s16, s25, s24
-; GFX6-NOHSA-NEXT:    s_or_b32 s5, s5, s26
-; GFX6-NOHSA-NEXT:    s_or_b32 s17, s28, s27
-; GFX6-NOHSA-NEXT:    s_or_b32 s18, s31, s30
-; GFX6-NOHSA-NEXT:    s_or_b32 s7, s7, s33
-; GFX6-NOHSA-NEXT:    s_or_b32 s19, s35, s34
-; GFX6-NOHSA-NEXT:    s_or_b32 s6, s6, s36
-; GFX6-NOHSA-NEXT:    s_or_b32 s4, s4, s29
+; GFX6-NOHSA-NEXT:    s_or_b32 s6, s6, s20
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s19
 ; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
index c119ef274bb04..35f1f3b7bc16b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC,SI-NOHSA %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC,SI-NOHSA %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=GCN-HSA,FUNC,GCNX3-HSA %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC,GCNX3-NOHSA %s
 
@@ -496,11 +496,11 @@ define amdgpu_kernel void @global_load_v11f32(ptr addrspace(1) %out, ptr addrspa
 ; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
-; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
-; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[3:6], off, s[8:11], 0
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NOHSA-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:40
 ; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
@@ -674,14 +674,14 @@ define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspa
 ; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
 ; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
 ; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
-; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
 ; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 73e0f6d4f1cf3..f8c8be01f2d35 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-SI %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-HSA %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-VI %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=EG %s
@@ -628,52 +628,63 @@ entry:
 define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 ; GCN-NOHSA-SI-LABEL: global_load_v16i16_align2:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, s10
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, s11
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s5
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s6
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s7
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:2
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v4, off, s[8:11], 0 offset:4
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 offset:6
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v5, off, s[8:11], 0 offset:8
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0 offset:10
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v6, off, s[8:11], 0 offset:12
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v7, off, s[8:11], 0 offset:14
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v8, off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v9, off, s[8:11], 0 offset:18
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v10, off, s[8:11], 0 offset:20
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v11, off, s[8:11], 0 offset:22
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v12, off, s[8:11], 0 offset:24
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v13, off, s[8:11], 0 offset:26
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v14, off, s[8:11], 0 offset:28
-; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v15, off, s[8:11], 0 offset:30
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0 offset:14
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:10
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 offset:6
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v3, off, s[4:7], 0 offset:2
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v4, off, s[4:7], 0 offset:30
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v5, off, s[4:7], 0 offset:26
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v6, off, s[4:7], 0 offset:22
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v7, off, s[4:7], 0 offset:18
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v8, off, s[4:7], 0 offset:12
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v9, off, s[4:7], 0 offset:8
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v10, off, s[4:7], 0 offset:4
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v11, off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v12, off, s[4:7], 0 offset:28
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v13, off, s[4:7], 0 offset:24
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v14, off, s[4:7], 0 offset:20
+; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v15, off, s[4:7], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s2
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s3
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(14)
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(13)
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v2
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(12)
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(11)
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(10)
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(9)
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(8)
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v7
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v3, v0, v8
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(6)
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v2, v1, v9
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(5)
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v1, v16, v10
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v0, v17, v11
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v7, v4, v12
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v6, v5, v13
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v5, v18, v14
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v3, v7, v6
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v2, v16, v5
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v1, v17, v4
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v0, v18, v0
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v7, v15, v14
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v6, v13, v12
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v5, v11, v10
-; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v4, v9, v8
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v4, v19, v15
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_load_v16i16_align2:
@@ -1847,14 +1858,14 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, 0xffff, v1
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, 0xffff, v3
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v2
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -1996,14 +2007,14 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v0, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v3, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v0, 0, 16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -2146,27 +2157,27 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v7
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v6
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v22, 0xffff, v7
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v6
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v7
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v6
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, 0xffff, v1
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v0
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, 0xffff, v3
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v2
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v5
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v4
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v22, 0xffff, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v6
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
@@ -2378,11 +2389,16 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v1
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v19, 16, v7
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v6
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v18, v7, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v6, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v0
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v1, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v0, 0, 16
@@ -2390,15 +2406,10 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v13, 16, v2
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v14, v3, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v12, v2, 0, 16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v5
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v4
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v5, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v4, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v19, 16, v7
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v18, v7, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v6, 0, 16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
@@ -2622,12 +2633,18 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v3
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v2
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v1
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v13
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v34, 0xffff, v13
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v32, 0xffff, v12
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v0
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v7
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v3
@@ -2640,7 +2657,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xffff, v6
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v5
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v4
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v9
@@ -2649,17 +2665,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v10
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v30, 0xffff, v9
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v28, 0xffff, v8
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v15
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v14
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v13
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, 0xffff, v15
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v14
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v34, 0xffff, v13
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v32, 0xffff, v12
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
@@ -3017,16 +3026,22 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v19, 16, v3
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v2
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v18, v3, 0, 16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v35, 16, v13
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v33, 16, v12
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v34, v13, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v32, v12, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v2, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v23, 16, v1
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v21, 16, v0
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v22, v1, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v20, v0, 0, 16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v7
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v6
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v7, 0, 16
@@ -3035,7 +3050,6 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v25, 16, v4
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v26, v5, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v24, v4, 0, 16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v11
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v10
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v11, 0, 16
@@ -3044,17 +3058,10 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v29, 16, v8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v30, v9, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v28, v8, 0, 16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v15
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v14
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v15, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v14, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v35, 16, v13
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v33, 16, v12
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v34, v13, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v32, v12, 0, 16
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
@@ -3422,20 +3429,19 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v64i16_to_v64i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s14, -1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN-NOHSA-SI-NEXT:    s_add_u32 s12, s12, s11
-; GCN-NOHSA-SI-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT:    s_add_u32 s12, s12, s11
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
@@ -3444,20 +3450,20 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    s_addc_u32 s13, s13, 0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(7)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v15
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v14
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(6)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v11
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v17, 0xffff, v15
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v15, 0xffff, v14
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v15, off, s[12:15], 0 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v16, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v17, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v18, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v13
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v12
@@ -3465,6 +3471,9 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v19, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v20, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(14)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v11
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(2)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v9
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v8
@@ -3472,15 +3481,6 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v23, 0xffff, v10
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v9
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v8
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(13)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v5
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v4
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, 0xffff, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v6
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v45, 0xffff, v5
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v43, 0xffff, v4
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(12)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
@@ -3490,7 +3490,19 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v2
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v49, 0xffff, v1
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v47, 0xffff, v0
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(11)
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(8)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v40
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v39
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v40
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v39
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v5
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v4
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, 0xffff, v7
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v6
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v45, 0xffff, v5
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v43, 0xffff, v4
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v30
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v29
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v28
@@ -3499,7 +3511,6 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v51, 0xffff, v29
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v57, 0xffff, v28
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v55, 0xffff, v27
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(10)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v34
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v33
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v32
@@ -3508,7 +3519,6 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v27, 0xffff, v33
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v61, 0xffff, v32
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v59, 0xffff, v31
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(9)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v38
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v37
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v36
@@ -3517,17 +3527,10 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v31, 0xffff, v37
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, 0xffff, v36
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v35
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(8)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v42
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v41
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v40
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v39
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v37, 0xffff, v42
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v35, 0xffff, v41
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v40
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v39
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
@@ -4203,118 +4206,131 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v64i16_to_v64i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s14, -1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN-NOHSA-SI-NEXT:    s_add_u32 s12, s12, s11
-; GCN-NOHSA-SI-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT:    s_add_u32 s12, s12, s11
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[23:26], off, s[8:11], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s6
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s7
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, s2
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, s3
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[40:43], off, s[4:7], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v11
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v10
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v11, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v10, 0, 16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v9
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v9, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v8, 0, 16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(6)
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v35
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v34
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v35, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v34, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v33
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v13, 16, v32
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v14, v33, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v12, v32, 0, 16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(5)
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v35, 16, v39
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v33, 16, v38
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v34, v39, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v32, v38, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v47, 16, v37
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v45, 16, v36
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v46, v37, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v44, v36, 0, 16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v39, 16, v43
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v37, 16, v42
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v38, v43, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v36, v42, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v51, 16, v41
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v49, 16, v40
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v50, v41, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v48, v40, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v43, 16, v31
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v41, 16, v30
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v42, v31, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v40, v30, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v55, 16, v29
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v53, 16, v28
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v54, v29, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v52, v28, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v30, 16, v27
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v28, 16, v26
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v29, v27, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v27, v26, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v59, 16, v25
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v57, 16, v24
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v58, v25, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v56, v24, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v26, 16, v23
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v24, 16, v22
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v25, v23, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v23, v22, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v63, 16, v21
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v61, 16, v20
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v62, v21, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v60, v20, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v22, 16, v19
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v20, 16, v18
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v21, v19, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v19, v18, 0, 16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 16, v15
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v14
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v15, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v14, 0, 16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v15, off, s[12:15], 0 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v16, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v17, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v18, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(2)
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v13
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v14, 16, v12
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v13, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v12, 0, 16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v13, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v14, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v15, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v16, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(14)
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v19, 16, v11
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(5)
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v10
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(4)
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v18, v11, 0, 16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v17
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v17, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v16, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v10, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v42, 16, v9
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v40, 16, v8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v41, v9, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v39, v8, 0, 16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(12)
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v3, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v50, 16, v1
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v48, 16, v0
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v49, v1, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v47, v0, 0, 16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(8)
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v36
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v35
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v36, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v35, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v7
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v13, 16, v6
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v14, v7, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v12, v6, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v46, 16, v5
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v44, 16, v4
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v45, v5, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v43, v4, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v54, 16, v26
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v52, 16, v25
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v53, v26, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v51, v25, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v58, 16, v24
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v56, 16, v23
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v57, v24, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v55, v23, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v26, 16, v30
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v24, 16, v29
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v25, v30, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v23, v29, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v62, 16, v28
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v60, 16, v27
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v61, v28, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v59, v27, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v30, 16, v34
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v28, 16, v33
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v29, v34, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v27, v33, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v32
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v31
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v32, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v31, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v34, 16, v38
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v32, 16, v37
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v33, v38, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v31, v37, 0, 16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
@@ -5420,10 +5436,10 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -5533,8 +5549,8 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -5647,15 +5663,15 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[8:9], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v9
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v9
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -5793,13 +5809,13 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v1, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v6, 16, v1
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v1, 0, 16
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -5942,23 +5958,23 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, v4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, v4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, v4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, v4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, v4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, v4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, v4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v15, 0xffff, v0
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v11, 0xffff, v2
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v7, 0xffff, v1
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32
@@ -6156,46 +6172,44 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s7, v3
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s4, v0
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s5, v1
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s7, v3
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s6, v2
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s7
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s5
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s6, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[4:5], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[6:7], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s13, s5, 31
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s15, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[10:11], 0x100000
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s20, s7, 31
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s21, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[8:9], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[12:13], 0x100000
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[4:5], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[6:7], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[14:15], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x100000
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s13
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s11
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s9
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64:
@@ -6412,50 +6426,50 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v29, 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v31, v29
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, v29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, v29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, v29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, v29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, v29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, v29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, v29
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v3
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v2
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v23, 0xffff, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v19, 0xffff, v2
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v11, 0xffff, v1
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v15, 0xffff, v3
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v28, 0xffff, v5
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v4
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v2
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xffff, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v2
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v1
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v3
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v6
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v4
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v27, 0xffff, v4
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v31, 0xffff, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v7, 0xffff, v5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v32, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v34, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v28, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v30, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v26, v8
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v4
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v29
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
@@ -6778,87 +6792,84 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s4, v0
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s5, v1
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s6, v2
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s7, v3
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s9, v3
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s7, v1
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s14, s9
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s6, v0
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s8, v2
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s8, v4
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s9, v5
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s11, v7
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s10, v6
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, s7
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s14, s5
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s16, s11
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s18, s9
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[28:29], s[8:9], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[30:31], s[10:11], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[34:35], s[4:5], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[6:7], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s21, s9, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s23, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[18:19], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s25, s11, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s27, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[16:17], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s5, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s38, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s39, s7, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s40, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[12:13], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[24:25], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[22:23], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[20:21], 0x100000
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s10, v4
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s11, v5
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s12, v6
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s13, v7
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s16, s7
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s39, s9, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s40, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s18, s13
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s20, s11
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s23, s11, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s25, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s27, s13, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s29, s13, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s7, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s38, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x100000
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s40
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s15
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[10:11], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[30:31], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[34:35], s[6:7], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[8:9], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[28:29], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[26:27], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[24:25], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[22:23], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x100000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s38
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s33
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s38
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s27
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s25
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s23
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s21
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s37
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s34
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s35
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s30
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s28
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s29
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s19
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s17
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s13
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s27
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s37
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s35
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s30
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v26, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v28, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v29, s7
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[26:29], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64:
@@ -7215,136 +7226,110 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v32i16_to_v32i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s14, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN-NOHSA-SI-NEXT:    s_add_u32 s12, s12, s11
-; GCN-NOHSA-SI-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v39, 0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[14:17], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[18:21], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[26:29], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[10:13], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[14:17], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v54, 0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v50, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v52, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v43, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v45, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v47, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v56, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v40, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v36, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v38, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v54
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v15
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v17
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v27
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v35, 0xffff, v27
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v29
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v39, 0xffff, v29
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v28
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v27, 0xffff, v28
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v30
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v42, 0xffff, v30
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v20
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v14
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v14
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, v3
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v30, 0xffff, v15
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v34, 0xffff, v17
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v18
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v18
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v20
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v19
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v19
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v21
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v42, 0xffff, v21
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(5)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v22
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v46, 0xffff, v22
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v24
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v24
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v23
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v50, 0xffff, v23
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v25
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v54, 0xffff, v25
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v29
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v26
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v58, 0xffff, v26
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v28
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xffff, v28
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v27
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v27
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v38, 0xffff, v29
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v41, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v39
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v33
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v49, 0xffff, v33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v28, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v30, v54
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v10
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v10
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, 0xffff, v14
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v17
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xffff, v17
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v31
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v46, 0xffff, v31
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v55, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v57, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v51, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v53, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v43, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v45, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v35, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v37, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v31, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v33, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v59, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v61, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v47, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v49, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v39
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v12, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v14, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(3)
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v12, off, s[12:15], 0 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(2)
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(1)
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, v54
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v39
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v39
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v49, v54
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v13
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v16
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v12
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v7, 0xffff, v12
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v11
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v21, 0xffff, v13
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v14
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v16
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v34
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v53, 0xffff, v34
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v42, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v54
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[53:56], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, v54
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, 0xffff, v15
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v31, 0xffff, v32
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v32, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v34, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, v54
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, v54
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v54
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
@@ -7914,172 +7899,178 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s10, v0
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s11, v1
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s12, v2
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s13, v3
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s16, v4
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s17, v5
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s14, v6
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s15, v7
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s18, v12
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s19, v13
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s20, v14
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s21, v15
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s22, v8
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s23, v9
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s25, v11
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s24, v10
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s26, s13
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s15, v3
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s10, v8
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s11, v9
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s20, v10
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s21, v11
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s36, s15
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s38, s13
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s26, s21
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s30, s11
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s34, s15
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s36, s17
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s38, s21
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s40, s19
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s42, s25
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s44, s23
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s46, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s48, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s50, s16, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s52, s20, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s54, s18, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s56, s24, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s58, s22, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[22:23], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[24:25], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[18:19], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s23, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s47, s23, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s49, s25, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s51, s25, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s53, s19, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s55, s19, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s57, s21, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s59, s21, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[20:21], 0x100000
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s23
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s64, s17, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s65, s17, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[30:31], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[26:27], s[16:17], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s54, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[10:11], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s57, s11, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s59, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s60, s21, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s61, s21, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[20:21], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[36:37], 0x100000
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s12, v0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[38:39], 0x100000
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s16, v4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s20
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s20, s15, 16
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s14, v2
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s19, v7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s20, s15, 31
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s17, v5
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s22, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s20
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s20, s13, 16
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s18, v6
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s24, s17
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s64, s19, 31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s20
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s20, s13, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s65, s19, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s62, s17, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s63, s17, 16
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s21
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s16, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s17, s13, 31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s16, s11, 16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s17
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s17, s11, 31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s16
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s66, s15, 31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s17
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s67, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[30:31], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[60:61], s[10:11], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[62:63], s[12:13], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[58:59], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[56:57], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[54:55], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[52:53], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[50:51], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[48:49], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[46:47], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x100000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s20
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x100000
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s34
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s35
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s67
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s66
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s37
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s23
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s65
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s64
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s35, v15
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s63
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s62
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s29, v13
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s40, s35
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x100000
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s27
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s61
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s60
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s42, s29
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s53, s35, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s55, s35, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s30
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s31
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s59
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s57
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s44, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s29, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s51, s29, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s40
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s41
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s55
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s53
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s46, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[38:39], s[14:15], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s42
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s43
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s51
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s49
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s44
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s45
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s47
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s34, v14
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s48, s18, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x100000
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s62
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s63
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s60
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s61
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s30
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s26
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s27
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s44
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s45
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s28, v12
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s50, s16, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s56, s34, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[34:35], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[34:35], s[18:19], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s37
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s46
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s47
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s58, s28, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[28:29], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[28:29], s[50:51], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s35
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s48
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s49
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[52:53], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s17
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s28
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s29
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[54:55], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s21
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[56:57], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s19
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[58:59], 0x100000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s25
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s23
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s21
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s17
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v26, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, s13
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s13
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 3923290976363..9b0218f5bcba4 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=SI-NOHSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=SI-NOHSA %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCNX3-HSA %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNX3-NOHSA %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
@@ -976,14 +976,14 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
 ; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
 ; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
 ; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
-; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
 ; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
@@ -1476,14 +1476,14 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out,
 ; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
-; SI-NOHSA-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
 ; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v4
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v5
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v2
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v3
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NOHSA-NEXT:    s_endpgm
 ;
@@ -1812,12 +1812,12 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
 ; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v7, v2
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v3, v0
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v5, v1
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16
@@ -1935,36 +1935,36 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
 ; SI-NOHSA:       ; %bb.0:
-; SI-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
-; SI-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NOHSA-NEXT:    s_mov_b32 s2, -1
-; SI-NOHSA-NEXT:    s_mov_b32 s10, s2
-; SI-NOHSA-NEXT:    s_mov_b32 s11, s3
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
 ; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
-; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v9, 0
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v11, v9
-; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
-; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v2
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v3
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v0
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v1
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v6
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v7
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v4
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v5
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; SI-NOHSA-NEXT:    s_endpgm
 ;
 ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64:
@@ -2132,21 +2132,21 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
 ; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v3
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v5
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v4
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v7
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v6
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v19, v6
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v21, v7
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v3
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v5
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v4
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v15, v4
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v17, v5
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v11, v2
@@ -2342,37 +2342,39 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
+; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
+; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v3
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v2
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v1
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v0
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v20, v0
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v22, v1
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v2
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v3
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v7
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v6
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v5
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v4
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v24, v4
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v26, v5
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v7
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v2
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v7
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v6
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v20, v6
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v22, v7
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v1
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v2
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v3
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v11
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v10
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v11
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v10
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v9
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v8
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v8
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v9
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v10
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v11
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v10
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v11
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v15
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v14
@@ -2382,14 +2384,12 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v34, v13
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v14
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v15
-; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
-; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
 ; SI-NOHSA-NEXT:    s_endpgm
@@ -2706,45 +2706,45 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v5, 0
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v7, v5
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v17, 0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v19, v17
 ; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
 ; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v0
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v1
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
-; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v2
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v3
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v8
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v9
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v1
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v10
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v11
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v2
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v3
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v16
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v17
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v4
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v5
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v18
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v19
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v6
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v7
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v8
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v9
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v12
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v13
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v10
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v11
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(6) expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v12
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v13
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v14
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v15
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v14
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v15
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
 ; SI-NOHSA-NEXT:    s_endpgm
 ;
 ; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64:
@@ -3006,12 +3006,6 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-NOHSA-LABEL: global_sextload_v32i32_to_v32i64:
 ; SI-NOHSA:       ; %bb.0:
-; SI-NOHSA-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; SI-NOHSA-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; SI-NOHSA-NEXT:    s_mov_b32 s14, -1
-; SI-NOHSA-NEXT:    s_mov_b32 s15, 0xe8f000
-; SI-NOHSA-NEXT:    s_add_u32 s12, s12, s11
-; SI-NOHSA-NEXT:    s_addc_u32 s13, s13, 0
 ; SI-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; SI-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NOHSA-NEXT:    s_mov_b32 s2, -1
@@ -3020,17 +3014,19 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:96
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
+; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v47, 31, v31
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v45, 31, v30
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v11
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v10
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v39, 31, v15
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v37, 31, v14
@@ -3040,46 +3036,41 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v42, v13
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v36, v14
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v38, v15
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v29
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v28
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v32, v28
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v34, v29
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v44, v30
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v46, v31
-; SI-NOHSA-NEXT:    buffer_store_dword v44, off, s[12:15], 0 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT:    buffer_store_dword v45, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT:    buffer_store_dword v46, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT:    buffer_store_dword v47, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(9)
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v7
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
-; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v9
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v8
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v12, v8
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v14, v9
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v32, v10
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v34, v11
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v7
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v6
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v47, 31, v5
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v45, 31, v4
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v44, v4
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v46, v5
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v12, v6
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v14, v7
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(8)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v6
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v7
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v3
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v19
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v19
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v23
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v51, 31, v1
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v49, 31, v0
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v48, v0
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v50, v1
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v2
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v3
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v19
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v18
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v55, 31, v17
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v53, 31, v16
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v52, v16
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v54, v17
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v18
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v19
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(6)
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v23
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v22
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v59, 31, v21
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v57, 31, v20
@@ -3087,47 +3078,43 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v58, v21
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v22
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v23
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v27
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v26
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v63, 31, v25
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v61, 31, v24
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v60, v24
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v62, v25
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v20, v26
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v22, v27
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4)
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v11
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v10
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v9
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v8
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v8
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v9
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v24, v10
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v26, v11
-; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
-; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v27
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v26
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v42, 31, v25
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v40, 31, v24
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192
-; SI-NOHSA-NEXT:    buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload
-; SI-NOHSA-NEXT:    buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; SI-NOHSA-NEXT:    buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; SI-NOHSA-NEXT:    buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v38, 31, v31
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v36, 31, v30
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v29
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v28
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v12, v28
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v14, v29
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:32
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v35, v30
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v37, v31
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v39, v24
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v41, v25
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(1)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v19, v26
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v21, v27
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[39:42], off, s[0:3], 0
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16
 ; SI-NOHSA-NEXT:    s_endpgm
 ;
 ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
@@ -3816,87 +3803,87 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; SI-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NOHSA-NEXT:    s_mov_b32 s2, -1
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NOHSA-NEXT:    s_mov_b32 s10, s2
 ; SI-NOHSA-NEXT:    s_mov_b32 s11, s3
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[32:35], off, s[8:11], 0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v29, 0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v31, v29
 ; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
 ; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:112
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v4
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v5
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:64
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:80
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v1
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:224
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v7
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v2
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v3
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:240
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(8) expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v8
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v9
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v4
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v5
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:192
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v10
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v11
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v32
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v33
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v34
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v35
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v28
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v29
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
-; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v30
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v31
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
-; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v24
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v25
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v26
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v27
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v6
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v7
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:208
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(9) expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v8
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v9
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:160
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v20
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v21
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v10
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v11
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:176
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(10) expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v12
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v13
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:128
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v22
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v23
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v14
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v15
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(11) expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v16
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v17
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v16
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v17
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v18
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v19
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(12) expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v20
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v21
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v18
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v19
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v22
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v23
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(13) expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v24
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v25
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v12
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v13
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v26
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v27
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:48
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v32
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v33
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v14
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v15
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v34
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v35
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:16
 ; SI-NOHSA-NEXT:    s_endpgm
 ;
 ; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64:
@@ -4373,30 +4360,32 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
 ; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
 ; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
 ; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
-; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
 ; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
-; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4)
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:96
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:112
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4)
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:64
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:80
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0 offset:32
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:48
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(6)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:96
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:112
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(6)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:64
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:80
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:48
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:16
 ; SI-NOHSA-NEXT:    s_endpgm
 ;
 ; GCNX3-HSA-LABEL: global_load_v32i32:
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index 2c79134f785bc..f074770ad6e92 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-SI %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-HSA %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-VI %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=EG %s
@@ -1726,14 +1726,14 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i32(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v1, v8, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v5, v9, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xff, v8
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v2, v8, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xff, v9
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v6, v9, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v1, v8, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xff, v8
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v2, v8, 16, 8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -2038,22 +2038,22 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v3
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v17, v3, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xff, v3
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v18, v3, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v1
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v9, v1, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v2
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v13, v2, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v3
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v17, v3, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v6, v0, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xff, v1
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v10, v1, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xff, v2
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v14, v2, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xff, v3
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v18, v3, 16, 8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
@@ -2264,6 +2264,10 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v19, 24, v3
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v18, v3, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v3, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v3, 0, 8
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 24, v0
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v0, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v5, v0, 8, 8
@@ -2276,10 +2280,6 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v14, v2, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v2, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v12, v2, 0, 8
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v19, 24, v3
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v18, v3, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v3, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v3, 0, 8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
@@ -2506,8 +2506,15 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v35, 24, v7
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v33, v7, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v32, 0xff, v7
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v34, v7, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v9, v0, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v1
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v13, v1, 8, 8
@@ -2523,25 +2530,18 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v18, v2, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xff, v3
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v22, v3, 16, 8
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 24, v4
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v1, v4, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v27, 24, v5
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v25, v5, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v6
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v29, v6, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v35, 24, v7
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v33, v7, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xff, v4
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v2, v4, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xff, v5
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v26, v5, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v28, 0xff, v6
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v30, v6, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v32, 0xff, v7
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v34, v7, 16, 8
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
@@ -2889,8 +2889,15 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 24, v0
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v35, 24, v7
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v34, v7, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v33, v7, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v32, v7, 0, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v0, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v9, v0, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v0, 0, 8
@@ -2906,7 +2913,6 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v22, v3, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v21, v3, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v20, v3, 0, 8
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 24, v4
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v4, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v1, v4, 8, 8
@@ -2919,12 +2925,6 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v30, v6, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v29, v6, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v28, v6, 0, 8
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v35, 24, v7
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v34, v7, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v33, v7, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v32, v7, 0, 8
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
@@ -3312,113 +3312,111 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out
 define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v64i8_to_v64i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s14, -1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN-NOHSA-SI-NEXT:    s_add_u32 s12, s12, s11
-; GCN-NOHSA-SI-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT:    s_add_u32 s12, s12, s11
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v13
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v5, v13, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v12
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v9, v12, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v15
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v17, v15, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 24, v14
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v21, v14, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xff, v13
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v6, v13, 16, 8
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v7, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xff, v12
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v10, v12, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xff, v15
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v18, v15, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xff, v14
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v22, v14, 16, 8
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(6)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v1
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v13, v1, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v35, 24, v0
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v33, v0, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v39, 24, v3
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v37, v3, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v43, 24, v2
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v41, v2, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xff, v1
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v14, v1, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v32, 0xff, v0
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v34, v0, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v36, 0xff, v3
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v38, v3, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v40, 0xff, v2
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v42, v2, 16, 8
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v25
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v5, v25, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v47, 24, v24
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v45, v24, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v51, 24, v27
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v49, v27, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v55, 24, v26
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v53, v26, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xff, v25
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v6, v25, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v44, 0xff, v24
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v46, v24, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v48, 0xff, v27
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v50, v27, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v52, 0xff, v26
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v54, v26, 16, 8
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v27, 24, v29
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v25, v29, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v59, 24, v28
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v57, v28, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v63, 24, v31
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v61, v31, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 24, v30
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v1, v30, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xff, v29
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v26, v29, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v56, 0xff, v28
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v58, v28, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v60, 0xff, v31
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v62, v31, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xff, v30
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v2, v30, 16, 8
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 24, v17
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v1, v17, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xff, v17
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v2, v17, 16, 8
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 24, v38
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v1, v38, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xff, v38
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v2, v38, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v16
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v5, v16, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v19
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v9, v19, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v18
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v13, v18, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xff, v16
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v6, v16, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xff, v19
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v10, v19, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xff, v18
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v14, v18, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v29
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v17, v29, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 24, v28
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v21, v28, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v27, 24, v31
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v25, v31, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v43, 24, v30
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v41, v30, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xff, v29
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v18, v29, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xff, v28
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v22, v28, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xff, v31
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v26, v31, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v40, 0xff, v30
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v42, v30, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v29, v33, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v47, 24, v32
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v45, v32, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v51, 24, v35
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v49, v35, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v55, 24, v34
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v53, v34, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v28, 0xff, v33
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v30, v33, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v44, 0xff, v32
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v46, v32, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v48, 0xff, v35
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v50, v35, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v52, 0xff, v34
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v54, v34, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v35, 24, v37
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v33, v37, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v59, 24, v36
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v57, v36, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v63, 24, v39
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v61, v39, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v32, 0xff, v37
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v34, v37, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v56, 0xff, v36
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v58, v36, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v60, 0xff, v39
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v62, v39, 16, 8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
@@ -4053,26 +4051,26 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out
 define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v64i8_to_v64i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s14, -1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN-NOHSA-SI-NEXT:    s_add_u32 s12, s12, s11
-; GCN-NOHSA-SI-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT:    s_add_u32 s12, s12, s11
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[10:13], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[26:29], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[30:33], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[34:37], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s6
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s7
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, s2
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, s3
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[10:13], off, s[4:7], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 24, v11
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v11, 16, 8
@@ -4082,6 +4080,11 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 24, v36
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v36, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v1, v36, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v36, 0, 8
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 24, v10
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v10, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v5, v10, 8, 8
@@ -4094,70 +4097,62 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v14, v12, 16, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v12, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v12, v12, 0, 8
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(6)
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v31, 24, v17
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v30, v17, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v29, v17, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v28, v17, 0, 8
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v35, 24, v16
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v34, v16, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v33, v16, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v32, v16, 0, 8
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v39, 24, v19
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v38, v19, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v37, v19, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v36, v19, 0, 8
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v43, 24, v18
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v42, v18, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v41, v18, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v40, v18, 0, 8
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(5)
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v19, 24, v21
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v18, v21, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v21, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v21, 0, 8
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v47, 24, v20
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v46, v20, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v45, v20, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v44, v20, 0, 8
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v51, 24, v23
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v50, v23, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v49, v23, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v48, v23, 0, 8
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v55, 24, v22
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v54, v22, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v53, v22, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v52, v22, 0, 8
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v23, 24, v25
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v22, v25, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v21, v25, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v20, v25, 0, 8
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v59, 24, v24
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v58, v24, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v57, v24, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v56, v24, 0, 8
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v63, 24, v27
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v62, v27, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v61, v27, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v60, v27, 0, 8
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 24, v26
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v26, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v1, v26, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v26, 0, 8
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v19, 24, v27
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v18, v27, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v27, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v27, 0, 8
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v23, 24, v26
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v22, v26, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v21, v26, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v20, v26, 0, 8
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v27, 24, v29
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v26, v29, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v25, v29, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v24, v29, 0, 8
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v41, 24, v28
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v40, v28, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v39, v28, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v38, v28, 0, 8
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v45, 24, v31
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v44, v31, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v43, v31, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v42, v31, 0, 8
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v49, 24, v30
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v48, v30, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v47, v30, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v46, v30, 0, 8
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v31, 24, v33
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v30, v33, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v29, v33, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v28, v33, 0, 8
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v53, 24, v32
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v52, v32, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v51, v32, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v50, v32, 0, 8
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v57, 24, v35
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v56, v35, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v55, v35, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v54, v35, 0, 8
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v61, 24, v34
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v60, v34, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v59, v34, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v58, v34, 0, 8
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v35, 24, v37
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v34, v37, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v33, v37, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v32, v37, 0, 8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
@@ -5280,10 +5275,10 @@ define amdgpu_kernel void @global_zextload_v2i8_to_v2i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -5412,8 +5407,8 @@ define amdgpu_kernel void @global_sextload_v2i8_to_v2i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -5542,10 +5537,10 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v1
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v6, v0, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
@@ -5695,14 +5690,14 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v4, 0, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v3, 0, 8
-; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
 ; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -5852,23 +5847,23 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[16:17], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, v1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v16
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 24, v17
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v0, v17, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v16
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v10, v17, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v14, v16, 8, 8
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xff, v16
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xff, v17
 ; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v4, v16, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v0, v17, 16, 8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
@@ -6070,50 +6065,49 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s4, v0
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s5, v1
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s4, v0
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s6, s5, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s8, s5, 8
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s5
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s4, 24
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s4, 8
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[4:5], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x80000
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s17, s5, 31
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s20, s5, 24
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[4:5], 0x80000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[16:17], 0x80000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x80000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x80000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x80000
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s19
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s9
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s5
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_sextload_v8i8_to_v8i64:
@@ -6357,49 +6351,49 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v9, v0, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v13, v1, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v11, 0xff, v1
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v7, 0xff, v0
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v17, v3, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v15, 0xff, v3
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v21, v2, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v19, 0xff, v2
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v25, 24, v1
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v23, v1, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v29, 24, v0
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v27, v0, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v3
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v31, v3, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v3, v2, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, v4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v32, v4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v34, v4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v28, v4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v30, v4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, v4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v26, v4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, v4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, v4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, v4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, v4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, v4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, v4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, v4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v33, 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v35, v33
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v29, v33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v31, v33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, v33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, v33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, v33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, v33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, v33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, v33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, v33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v33
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v34, 24, v2
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v32, v2, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v6, v3, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v10, v0, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v14, v1, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xff, v1
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xff, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xff, v3
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v18, v2, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xff, v2
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v22, 24, v1
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v20, v1, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v26, 24, v0
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v24, v0, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v30, 24, v3
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v28, v3, 16, 8
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v16i8_to_v16i64:
@@ -6719,91 +6713,90 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, 0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s13, s11
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s15, s11
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s17, s11
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s13, 0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s15, s13
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s17, s13
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s19, s13
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s7, v3
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s6, v2
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s7, 16
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s4, v0
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s5, v1
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s8, v2
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s9, v3
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s10, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s9, 8
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s18, s9
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s8, 24
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s8, 8
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s5, 8
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s26, s5
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s7, 8
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s20, s7
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s6, 24
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s6, 8
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s35, s7, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s38, s7, 24
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s5, 8
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s28, s5
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s10, s4, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s4, 24
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s4, 8
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[4:5], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[8:9], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[6:7], 0x80000
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s31, s5, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s5, 24
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s35, s9, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s38, s9, 24
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[34:35], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[30:31], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x80000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x80000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x80000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x80000
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s38
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s35
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s36
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s37
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s33
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s26
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s27
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[4:5], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s5, 24
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[34:35], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[30:31], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s37
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s15
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s13
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s21
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s23
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s25
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s27
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s17
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s17
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s28
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s29
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s9
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s28
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s19
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s5
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s5
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_sextload_v16i8_to_v16i64:
@@ -7205,12 +7198,6 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
 define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v32i8_to_v32i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s14, -1
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN-NOHSA-SI-NEXT:    s_add_u32 s12, s12, s11
-; GCN-NOHSA-SI-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
@@ -7219,137 +7206,98 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[10:13], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[17:20], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v2, v12, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v3, v11, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v6, v10, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xff, v10
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v7, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v1, 0xff, v11
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v4, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, 0xff, v12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, v2
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v55, v13, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v53, 0xff, v13
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v27, 24, v10
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v25, v10, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v11
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v29, v11, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v35, 24, v12
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v33, v12, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v39, 24, v13
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v37, v13, 16, 8
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(8)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 24, v20
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v43, v17, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v41, 0xff, v17
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v47, v18, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v45, 0xff, v18
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v51, v19, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v49, 0xff, v19
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v23, v20, 8, 8
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v21, 0xff, v20
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v59, 24, v17
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v57, v17, 16, 8
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(5)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v18
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v4, v18, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 24, v19
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v0, v19, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v8, v20, 16, 8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v58, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v60, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v38, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v40, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v34, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v36, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v30, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v32, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v26, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v28, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v50, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v52, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v46, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v48, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v42, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v44, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v54, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v56, v9
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v53, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v54, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v55, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v56, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, v9
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v12, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(3)
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v14, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(2)
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1) expcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, v14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, v12
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v9
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v53, off, s[12:15], 0 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v54, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v55, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v56, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v54, v9
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v56, v9
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[3:6], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v54, 0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[57:60], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v50, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v52, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v47, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v29, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v31, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v26, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v44, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v41, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v37, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v39, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v33, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v35, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v56, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, v54
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v13, v5, 8, 8
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v51, 24, v8
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v49, v8, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v2, v3, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xff, v3
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v11, 0xff, v5
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v21, v6, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v19, 0xff, v6
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v24, 24, v3
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v22, v3, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v30, 24, v5
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v28, v5, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 24, v6
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v3, v6, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v34, v7, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v32, 0xff, v7
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v38, v8, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v36, 0xff, v8
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v42, v9, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v40, 0xff, v9
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v48, 24, v7
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v46, v7, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v9
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v6, v9, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v54
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[53:56], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v49, v54
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v17, v4, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v15, 0xff, v4
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v27, 24, v4
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v25, v4, 16, 8
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v45, v10, 8, 8
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v43, 0xff, v10
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, v54
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(1)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, v54
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v28, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v46, v54
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, v54
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v43, v54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v54
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v55, 24, v10
+; GCN-NOHSA-SI-NEXT:    v_bfe_u32 v53, v10, 16, 8
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, v54
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[53:56], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v32i8_to_v32i64:
@@ -7910,179 +7858,178 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s29, 0
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s29, 0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s27, s29
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s23, s29
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s29
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, s29
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s29
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s29
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s13, s29
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s15, s29
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s17, s29
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s38, v0
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s39, v1
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s40, v2
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s41, v3
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s20, v4
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s21, v5
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s24, v6
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s25, v7
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s41, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s41, 8
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s41
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s40, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s40, 24
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s40, 8
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s6, s39, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s8, s39, 8
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, s39
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s4, s38, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s38, 24
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s38, 8
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[42:43], s[10:11], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s10, s25, 16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s42
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s43
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s5, s41, 31
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s14, s41, 24
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[42:43], s[12:13], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s25, 8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s40
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s41
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s40, s25
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s42
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s43
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s42, s24, 16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s14
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s19, s39, 31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s5
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s5, s39, 24
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s43, v3
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s5, v1
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s43, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s10, s5, 8
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s6, s43, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s8, s43, 24
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s38
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s39
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s38, s24, 24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s28
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s29
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s24, 8
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s21, 16
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s4, v0
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s42, v2
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s43, 8
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, s43
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s30, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s28
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s29
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s21, 8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s23
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s22, s21
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x80000
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s38, s42, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s42, 24
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s42, 8
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s16, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s4, 24
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[44:45], s[12:13], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s23
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[30:31], 0x80000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s36
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s37
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s20, 16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s10, s5, 24
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s24, v4
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s25, v5
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s21, v7
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s40, s4, 8
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x80000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s44
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s45
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s27
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x80000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x80000
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s34
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s35
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s20, 24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s30
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s31
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s44, s20, 8
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s10
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s10, s5, 31
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s20, v6
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s6, s21, 16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s42
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s43
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s12, s25, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s14, s25, 8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s38
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s39
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s38, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s37
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s24, 16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s35
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s24, 24
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s46, s24, 8
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s25, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s43, s25, 24
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[24:25], 0x80000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s10
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[4:5], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x80000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s5, s25, 31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s18
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s19
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s19, s25, 24
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s8, s21, 8
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s42, s21
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s45, s21, 31
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s47, s21, 24
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s19
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s33, s21, 31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s5
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s39, s21, 24
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[30:31], s[20:21], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[46:47], s[24:25], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[48:49], s[22:23], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[44:45], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[34:35], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[36:37], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s40
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s41
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s20, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s44, s20, 24
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[34:35], 0x80000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[34:35], s[38:39], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[42:43], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[38:39], s[42:43], 0x80000
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s40
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s41
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s46
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s47
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s9
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s5
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s19
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s39
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s33
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s26
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s27
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s47
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s45
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s20, 8
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[30:31], s[36:37], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[36:37], s[44:45], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s48
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s49
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s11
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s30
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s31
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s13
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s28
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s29
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s36
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s37
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s34
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s35
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s28
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s27
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x80000
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s15
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s17
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(2)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s25
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s43
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s35
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[46:47], 0x80000
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s30
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s11
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s21
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_sextload_v32i8_to_v32i64:
@@ -9551,8 +9498,8 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xff00, v0
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v3, 0xff, v0
-; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GCN-NOHSA-SI-NEXT:    v_lshr_b64 v[0:1], v[0:1], 16
+; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v1, v3, v2
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xff00ff, v0
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
@@ -9908,28 +9855,28 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s4, v0
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s6, v1
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s1, s0, 24
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s5, s4, 24
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s7, s6, 24
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s0, 0xff00
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s4, 0xff00
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s6, 0xff00
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s6, 0xff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s0, 0xff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s4, 0xff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s9, s9, 8
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s8, s8, 8
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[6:7], s[6:7], 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s7, s10, s9
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s8, s11, s8
+; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s5, s10, s9
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s7, s11, s8
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xff00ff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s0, 0xff00ff
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s6
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -10178,34 +10125,34 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i16(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s1, v1
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s6, s1, 24
-; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s7, s1, 0x80010
-; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s8, s1, 0x80008
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s1, s1
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s9, s0, 24
-; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s10, s0, 0x80010
-; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s11, s0, 0x80008
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s0, s0
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s4, v0
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s5, v1
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s6, s5, 24
+; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s7, s5, 0x80010
+; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s8, s5, 0x80008
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s5, s5
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s9, s4, 24
+; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s10, s4, 0x80010
+; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s11, s4, 0x80008
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s4, s4
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s8, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s1, s1, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s9, s9, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s11, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s6, s7, s6
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s7, s1, s8
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s8, s10, s9
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s9, s0, s11
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s5, s5, s8
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s7, s10, s9
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s4, s4, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s6
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
@@ -10499,49 +10446,48 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s4, v2
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s6, v3
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s8, v0
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s10, v1
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s6, v3
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s9, s8, 24
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s11, s10, 24
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s5, s4, 24
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s7, s6, 24
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s4, 0xff00
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s6, 0xff00
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s9, s8, 24
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s11, s10, 24
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s8, 0xff00
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s10, 0xff00
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s16, s10, 0xff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s17, s8, 0xff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s18, s6, 0xff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s19, s4, 0xff
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s15, s15, 8
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s14, s14, 8
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[10:11], s[10:11], 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s18, s6, 0xff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s19, s4, 0xff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s9, s13, 8
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s11, s12, 8
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[6:7], s[6:7], 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], 16
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s5, s16, s15
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s7, s17, s14
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xff00ff
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s15, s15, 8
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s14, s14, 8
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s9, s18, s9
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s11, s19, s11
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xff00ff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s5, s16, s15
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s7, s17, s14
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xff00ff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xff00ff
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s10
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s10
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v16i8_to_v16i16:
@@ -10977,8 +10923,6 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s7, s7
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s11, s6, 24
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s12, s6, 0x80010
-; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s13, s6, 0x80008
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s6, s6
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s14, s5, 24
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s15, s5, 0x80010
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s16, s5, 0x80008
@@ -10987,14 +10931,14 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s18, s4, 0x80010
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s19, s4, 0x80008
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s4, s4
+; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s13, s6, 0x80008
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s6, s6
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s8, s8, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s10, s10, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s11, s11, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s13, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s14, s14, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s16, s16, 16
@@ -11003,6 +10947,8 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s18, s18, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s19, s19, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s13, s13, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s8, s9, s8
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s7, s7, s10
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s9, s12, s11
@@ -11015,13 +10961,12 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s8
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s8
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_sextload_v16i8_to_v16i16:
@@ -11535,59 +11480,63 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s4, v2
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s6, v3
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s8, v0
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s10, v1
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s6, v3
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s12, v6
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s14, v7
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s16, v4
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s18, v5
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s9, s8, 24
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s11, s10, 24
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s5, s4, 24
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s7, s6, 24
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s20, s4, 0xff00
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s21, s6, 0xff00
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s9, s8, 24
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s11, s10, 24
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s22, s8, 0xff00
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s23, s10, 0xff00
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s13, s12, 24
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s15, s14, 24
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s24, s12, 0xff00
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s25, s14, 0xff00
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s17, s16, 24
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s18, 24
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s26, s16, 0xff00
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s27, s18, 0xff00
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s33, s10, 0xff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s34, s8, 0xff
+; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[10:11], s[10:11], 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s24, s12, 0xff00
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s25, s14, 0xff00
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s28, s18, 0xff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s29, s16, 0xff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s30, s14, 0xff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s31, s12, 0xff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s33, s10, 0xff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s34, s8, 0xff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s35, s6, 0xff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s36, s4, 0xff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s27, s27, 8
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s26, s26, 8
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[18:19], s[18:19], 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[16:17], s[16:17], 16
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s17, s25, 8
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s19, s24, 8
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[14:15], s[14:15], 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[12:13], s[12:13], 16
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s13, s23, 8
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s15, s22, 8
-; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[10:11], s[10:11], 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[8:9], s[8:9], 16
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s9, s21, 8
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s11, s20, 8
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[6:7], s[6:7], 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s17, s25, 8
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s19, s24, 8
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s13, s23, 8
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s15, s22, 8
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s5, s28, s27
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s7, s29, s26
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s18, s18, 0xff00ff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s16, s16, 0xff00ff
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s9, s35, s9
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s11, s36, s11
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xff00ff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xff00ff
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s17, s30, s17
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s19, s31, s19
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, 0xff00ff
@@ -11595,34 +11544,27 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s13, s33, s13
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s15, s34, s15
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xff00ff
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s9, s35, s9
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s11, s36, s11
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xff00ff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xff00ff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xff00ff
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s14
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s10
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s14
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s18
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v32i8_to_v32i16:
@@ -12402,30 +12344,46 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s4, v2
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s5, v3
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s6, v0
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s7, v1
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s8, v6
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s9, v7
-; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s10, v4
 ; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s11, v5
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s18, s9, 24
+; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s19, s9, 0x80010
+; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s20, s9, 0x80008
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s9, s9
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s12, s11, 24
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s13, s11, 0x80010
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s14, s11, 0x80008
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s11, s11
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s20, s20, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s34, s4, 24
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s14, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s18, s18, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s19, s19, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s9, s9, s20
+; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s20, s4, 0x80010
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s5, v3
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s11, s11, s14
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s14, s19, s18
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s19, s34, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s20, s20, 0xffff
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s6, v0
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s7, v1
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s10, v4
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s30, s5, 24
+; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s31, s5, 0x80010
+; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s33, s5, 0x80008
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s5, s5
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s19, s20, s19
+; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s20, s4, 0x80008
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s4, s4
+; GCN-NOHSA-SI-NEXT:    v_readfirstlane_b32 s8, v6
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s15, s10, 24
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s16, s10, 0x80010
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s17, s10, 0x80008
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s10, s10
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s18, s9, 24
-; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s19, s9, 0x80010
-; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s20, s9, 0x80008
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s9, s9
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s21, s8, 24
-; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s22, s8, 0x80010
-; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s23, s8, 0x80008
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s8, s8
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s24, s7, 24
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s25, s7, 0x80010
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s26, s7, 0x80008
@@ -12434,30 +12392,22 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s28, s6, 0x80010
 ; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s29, s6, 0x80008
 ; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s6, s6
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s30, s5, 24
-; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s31, s5, 0x80010
-; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s33, s5, 0x80008
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s5, s5
-; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s34, s4, 24
-; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s35, s4, 0x80010
-; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s36, s4, 0x80008
-; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s4, s4
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s30, s30, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s31, s31, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s33, s33, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s20, s20, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_ashr_i32 s21, s8, 24
+; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s22, s8, 0x80010
+; GCN-NOHSA-SI-NEXT:    s_bfe_i32 s23, s8, 0x80008
+; GCN-NOHSA-SI-NEXT:    s_sext_i32_i8 s8, s8
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s12, s12, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s14, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s15, s15, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s16, s16, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s17, s17, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s18, s18, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s19, s19, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s20, s20, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s21, s21, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s22, s22, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s23, s23, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s24, s24, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s25, s25, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s26, s26, 16
@@ -12466,34 +12416,26 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s28, s28, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s29, s29, 16
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s30, s30, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s31, s31, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s33, s33, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s34, s34, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s35, s35, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s36, s36, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s18, s31, s30
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s5, s5, s33
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s4, s4, s20
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s21, s21, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s22, s22, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshl_b32 s23, s23, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s12, s13, s12
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s11, s11, s14
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s13, s16, s15
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s10, s10, s17
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s14, s19, s18
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s9, s9, s20
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s15, s22, s21
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s8, s8, s23
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s16, s25, s24
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s7, s7, s26
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s17, s28, s27
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s18, s31, s30
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s5, s5, s33
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s19, s35, s34
-; GCN-NOHSA-SI-NEXT:    s_or_b32 s4, s4, s36
 ; GCN-NOHSA-SI-NEXT:    s_or_b32 s6, s6, s29
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s18
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s15, s22, s21
+; GCN-NOHSA-SI-NEXT:    s_or_b32 s8, s8, s23
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index fd5710edb1257..c4c15e0095c85 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=VI,VI-NO-DS128 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX9,GFX9-NO-DS128 %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG %s
@@ -12,9 +12,9 @@ define amdgpu_kernel void @local_load_i16(ptr addrspace(3) %out, ptr addrspace(3
 ; SI-LABEL: local_load_i16:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_u16 v0, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -63,9 +63,9 @@ define amdgpu_kernel void @local_load_v2i16(ptr addrspace(3) %out, ptr addrspace
 ; SI-LABEL: local_load_v2i16:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_b32 v0, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -114,9 +114,9 @@ define amdgpu_kernel void @local_load_v3i16(ptr addrspace(3) %out, ptr addrspace
 ; SI-LABEL: local_load_v3i16:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_b64 v[0:1], v0
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -183,9 +183,9 @@ define amdgpu_kernel void @local_load_v4i16(ptr addrspace(3) %out, ptr addrspace
 ; SI-LABEL: local_load_v4i16:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_b64 v[0:1], v0
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -241,9 +241,9 @@ define amdgpu_kernel void @local_load_v8i16(ptr addrspace(3) %out, ptr addrspace
 ; SI-LABEL: local_load_v8i16:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
 ; SI-NEXT:    v_mov_b32_e32 v4, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -336,9 +336,9 @@ define amdgpu_kernel void @local_load_v16i16(ptr addrspace(3) %out, ptr addrspac
 ; SI-LABEL: local_load_v16i16:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v4, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
 ; SI-NEXT:    ds_read2_b64 v[4:7], v4 offset1:1
 ; SI-NEXT:    v_mov_b32_e32 v8, s0
@@ -474,9 +474,9 @@ define amdgpu_kernel void @local_zextload_i16_to_i32(ptr addrspace(3) %out, ptr
 ; SI-LABEL: local_zextload_i16_to_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_u16 v0, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -525,9 +525,9 @@ define amdgpu_kernel void @local_sextload_i16_to_i32(ptr addrspace(3) %out, ptr
 ; SI-LABEL: local_sextload_i16_to_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_i16 v0, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -578,9 +578,9 @@ define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(ptr addrspace(3) %out,
 ; SI-LABEL: local_zextload_v1i16_to_v1i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_u16 v0, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -629,9 +629,9 @@ define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(ptr addrspace(3) %out,
 ; SI-LABEL: local_sextload_v1i16_to_v1i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_i16 v0, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -682,14 +682,14 @@ define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(ptr addrspace(3) %out,
 ; SI-LABEL: local_zextload_v2i16_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_b32 v0, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    ds_write_b64 v2, v[0:1]
 ; SI-NEXT:    s_endpgm
 ;
@@ -745,14 +745,14 @@ define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(ptr addrspace(3) %out,
 ; SI-LABEL: local_sextload_v2i16_to_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_b32 v0, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
 ; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    ds_write_b64 v2, v[0:1]
 ; SI-NEXT:    s_endpgm
 ;
@@ -810,9 +810,9 @@ define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(ptr addrspace(3)
 ; SI-LABEL: local_local_zextload_v3i16_to_v3i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_b64 v[0:1], v0
 ; SI-NEXT:    v_mov_b32_e32 v4, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -886,9 +886,9 @@ define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(ptr addrspace(3)
 ; SI-LABEL: local_local_sextload_v3i16_to_v3i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_b64 v[0:1], v0
 ; SI-NEXT:    v_mov_b32_e32 v4, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -966,9 +966,9 @@ define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(ptr addrspace(3)
 ; SI-LABEL: local_local_zextload_v4i16_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_b64 v[0:1], v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
@@ -1078,9 +1078,9 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(ptr addrspace(3) %out,
 ; SI-LABEL: local_sextload_v4i16_to_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_b64 v[0:1], v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
@@ -1193,22 +1193,22 @@ define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(ptr addrspace(3) %out,
 ; SI-LABEL: local_zextload_v8i16_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
-; SI-NEXT:    v_mov_b32_e32 v12, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v1
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v2
 ; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v3
-; SI-NEXT:    ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3
-; SI-NEXT:    ds_write2_b64 v12, v[4:5], v[6:7] offset1:1
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; SI-NEXT:    ds_write2_b64 v0, v[8:9], v[10:11] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v0, v[4:5], v[6:7] offset1:1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-NO-DS128-LABEL: local_zextload_v8i16_to_v8i32:
@@ -1354,22 +1354,22 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(ptr addrspace(3) %out,
 ; SI-LABEL: local_sextload_v8i16_to_v8i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
-; SI-NEXT:    v_mov_b32_e32 v12, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
-; SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
 ; SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
 ; SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
 ; SI-NEXT:    v_bfe_i32 v4, v0, 0, 16
-; SI-NEXT:    v_bfe_i32 v6, v1, 0, 16
 ; SI-NEXT:    v_bfe_i32 v8, v2, 0, 16
 ; SI-NEXT:    v_bfe_i32 v10, v3, 0, 16
-; SI-NEXT:    ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3
-; SI-NEXT:    ds_write2_b64 v12, v[4:5], v[6:7] offset1:1
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
+; SI-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; SI-NEXT:    ds_write2_b64 v0, v[8:9], v[10:11] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v0, v[4:5], v[6:7] offset1:1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-NO-DS128-LABEL: local_sextload_v8i16_to_v8i32:
@@ -1520,9 +1520,9 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out
 ; SI-LABEL: local_zextload_v16i16_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v4, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read2_b64 v[0:3], v4 offset1:1
 ; SI-NEXT:    ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
 ; SI-NEXT:    s_waitcnt lgkmcnt(1)
@@ -1531,7 +1531,6 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out
 ; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v5
 ; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v4
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v1
 ; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v0
@@ -1539,11 +1538,12 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v5
 ; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v4
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v7
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v6
 ; SI-NEXT:    v_mov_b32_e32 v4, s0
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v5
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v5
 ; SI-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7
 ; SI-NEXT:    ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5
 ; SI-NEXT:    ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3
@@ -1789,9 +1789,9 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out
 ; SI-LABEL: local_sextload_v16i16_to_v16i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v4, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read2_b64 v[0:3], v4 offset1:1
 ; SI-NEXT:    ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
 ; SI-NEXT:    s_waitcnt lgkmcnt(1)
@@ -1800,7 +1800,6 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out
 ; SI-NEXT:    v_ashrrev_i32_e32 v13, 16, v3
 ; SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v5
 ; SI-NEXT:    v_ashrrev_i32_e32 v19, 16, v4
 ; SI-NEXT:    v_bfe_i32 v8, v1, 0, 16
 ; SI-NEXT:    v_bfe_i32 v10, v0, 0, 16
@@ -1808,11 +1807,12 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out
 ; SI-NEXT:    v_bfe_i32 v14, v2, 0, 16
 ; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v7
 ; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v6
-; SI-NEXT:    v_bfe_i32 v16, v5, 0, 16
 ; SI-NEXT:    v_bfe_i32 v18, v4, 0, 16
 ; SI-NEXT:    v_bfe_i32 v0, v7, 0, 16
 ; SI-NEXT:    v_bfe_i32 v2, v6, 0, 16
 ; SI-NEXT:    v_mov_b32_e32 v4, s0
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v5
+; SI-NEXT:    v_bfe_i32 v16, v5, 0, 16
 ; SI-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7
 ; SI-NEXT:    ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5
 ; SI-NEXT:    ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3
@@ -2068,58 +2068,58 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(ptr addrspace(3) %out
 ; SI-LABEL: local_zextload_v32i16_to_v32i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v12, s1
 ; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    ds_read2_b64 v[0:3], v12 offset1:1
-; SI-NEXT:    ds_read2_b64 v[4:7], v12 offset0:2 offset1:3
-; SI-NEXT:    ds_read2_b64 v[8:11], v12 offset0:4 offset1:5
-; SI-NEXT:    ds_read2_b64 v[12:15], v12 offset0:6 offset1:7
-; SI-NEXT:    s_waitcnt lgkmcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v2
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v1
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v0
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v3
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v2
-; SI-NEXT:    s_waitcnt lgkmcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v5
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    s_waitcnt lgkmcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v9
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v11
-; SI-NEXT:    v_and_b32_e32 v26, 0xffff, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v13
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v15
-; SI-NEXT:    v_and_b32_e32 v30, 0xffff, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT:    v_mov_b32_e32 v24, s1
+; SI-NEXT:    ds_read2_b64 v[0:3], v24 offset1:1
+; SI-NEXT:    ds_read2_b64 v[4:7], v24 offset0:2 offset1:3
 ; SI-NEXT:    v_mov_b32_e32 v32, s0
-; SI-NEXT:    ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15
-; SI-NEXT:    ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13
-; SI-NEXT:    ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11
-; SI-NEXT:    ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9
-; SI-NEXT:    ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7
-; SI-NEXT:    ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5
-; SI-NEXT:    ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3
-; SI-NEXT:    ds_write2_b64 v32, v[18:19], v[16:17] offset1:1
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v2
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v5
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v3
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v7
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v5
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v4
+; SI-NEXT:    ds_read2_b64 v[0:3], v24 offset0:4 offset1:5
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v6
+; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v6
+; SI-NEXT:    ds_read2_b64 v[4:7], v24 offset0:6 offset1:7
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v1
+; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v7
+; SI-NEXT:    v_and_b32_e32 v30, 0xffff, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v3
+; SI-NEXT:    v_and_b32_e32 v26, 0xffff, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v5
+; SI-NEXT:    v_and_b32_e32 v28, 0xffff, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT:    ds_write2_b64 v32, v[6:7], v[30:31] offset0:14 offset1:15
+; SI-NEXT:    ds_write2_b64 v32, v[4:5], v[28:29] offset0:12 offset1:13
+; SI-NEXT:    ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11
+; SI-NEXT:    ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9
+; SI-NEXT:    ds_write2_b64 v32, v[22:23], v[20:21] offset0:6 offset1:7
+; SI-NEXT:    ds_write2_b64 v32, v[18:19], v[16:17] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v32, v[14:15], v[12:13] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v32, v[10:11], v[8:9] offset1:1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-NO-DS128-LABEL: local_zextload_v32i16_to_v32i32:
@@ -2550,58 +2550,58 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out
 ; SI-LABEL: local_sextload_v32i16_to_v32i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v12, s1
 ; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    ds_read2_b64 v[0:3], v12 offset1:1
-; SI-NEXT:    ds_read2_b64 v[4:7], v12 offset0:2 offset1:3
-; SI-NEXT:    ds_read2_b64 v[8:11], v12 offset0:4 offset1:5
-; SI-NEXT:    ds_read2_b64 v[12:15], v12 offset0:6 offset1:7
-; SI-NEXT:    s_waitcnt lgkmcnt(3)
-; SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v1
-; SI-NEXT:    v_ashrrev_i32_e32 v19, 16, v0
-; SI-NEXT:    v_ashrrev_i32_e32 v21, 16, v3
-; SI-NEXT:    v_ashrrev_i32_e32 v23, 16, v2
-; SI-NEXT:    v_bfe_i32 v16, v1, 0, 16
-; SI-NEXT:    v_bfe_i32 v18, v0, 0, 16
-; SI-NEXT:    v_bfe_i32 v20, v3, 0, 16
-; SI-NEXT:    v_bfe_i32 v22, v2, 0, 16
-; SI-NEXT:    s_waitcnt lgkmcnt(2)
-; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v5
-; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v4
-; SI-NEXT:    v_bfe_i32 v0, v5, 0, 16
-; SI-NEXT:    v_bfe_i32 v2, v4, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v7
-; SI-NEXT:    v_bfe_i32 v4, v7, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v6
-; SI-NEXT:    v_bfe_i32 v6, v6, 0, 16
-; SI-NEXT:    s_waitcnt lgkmcnt(1)
-; SI-NEXT:    v_ashrrev_i32_e32 v25, 16, v9
-; SI-NEXT:    v_bfe_i32 v24, v9, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v8
-; SI-NEXT:    v_bfe_i32 v8, v8, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v27, 16, v11
-; SI-NEXT:    v_bfe_i32 v26, v11, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v10
-; SI-NEXT:    v_bfe_i32 v10, v10, 0, 16
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_ashrrev_i32_e32 v29, 16, v13
-; SI-NEXT:    v_bfe_i32 v28, v13, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v13, 16, v12
-; SI-NEXT:    v_bfe_i32 v12, v12, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v31, 16, v15
-; SI-NEXT:    v_bfe_i32 v30, v15, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v14
-; SI-NEXT:    v_bfe_i32 v14, v14, 0, 16
+; SI-NEXT:    v_mov_b32_e32 v24, s1
+; SI-NEXT:    ds_read2_b64 v[0:3], v24 offset1:1
+; SI-NEXT:    ds_read2_b64 v[4:7], v24 offset0:2 offset1:3
 ; SI-NEXT:    v_mov_b32_e32 v32, s0
-; SI-NEXT:    ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15
-; SI-NEXT:    ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13
-; SI-NEXT:    ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11
-; SI-NEXT:    ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9
-; SI-NEXT:    ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7
-; SI-NEXT:    ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5
-; SI-NEXT:    ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3
-; SI-NEXT:    ds_write2_b64 v32, v[18:19], v[16:17] offset1:1
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v0
+; SI-NEXT:    v_ashrrev_i32_e32 v13, 16, v3
+; SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v2
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v5
+; SI-NEXT:    v_bfe_i32 v8, v1, 0, 16
+; SI-NEXT:    v_bfe_i32 v10, v0, 0, 16
+; SI-NEXT:    v_bfe_i32 v12, v3, 0, 16
+; SI-NEXT:    v_bfe_i32 v14, v2, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 16, v4
+; SI-NEXT:    v_ashrrev_i32_e32 v21, 16, v7
+; SI-NEXT:    v_bfe_i32 v16, v5, 0, 16
+; SI-NEXT:    v_bfe_i32 v18, v4, 0, 16
+; SI-NEXT:    ds_read2_b64 v[0:3], v24 offset0:4 offset1:5
+; SI-NEXT:    v_bfe_i32 v20, v7, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v23, 16, v6
+; SI-NEXT:    v_bfe_i32 v22, v6, 0, 16
+; SI-NEXT:    ds_read2_b64 v[4:7], v24 offset0:6 offset1:7
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_ashrrev_i32_e32 v25, 16, v1
+; SI-NEXT:    v_bfe_i32 v24, v1, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
+; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_ashrrev_i32_e32 v31, 16, v7
+; SI-NEXT:    v_bfe_i32 v30, v7, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v6
+; SI-NEXT:    v_bfe_i32 v6, v6, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v27, 16, v3
+; SI-NEXT:    v_bfe_i32 v26, v3, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v2
+; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v29, 16, v5
+; SI-NEXT:    v_bfe_i32 v28, v5, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v4
+; SI-NEXT:    v_bfe_i32 v4, v4, 0, 16
+; SI-NEXT:    ds_write2_b64 v32, v[6:7], v[30:31] offset0:14 offset1:15
+; SI-NEXT:    ds_write2_b64 v32, v[4:5], v[28:29] offset0:12 offset1:13
+; SI-NEXT:    ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11
+; SI-NEXT:    ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9
+; SI-NEXT:    ds_write2_b64 v32, v[22:23], v[20:21] offset0:6 offset1:7
+; SI-NEXT:    ds_write2_b64 v32, v[18:19], v[16:17] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v32, v[14:15], v[12:13] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v32, v[10:11], v[8:9] offset1:1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-NO-DS128-LABEL: local_sextload_v32i16_to_v32i32:
@@ -3054,119 +3054,118 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out
 define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 ; SI-LABEL: local_zextload_v64i16_to_v64i32:
 ; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; SI-NEXT:    s_mov_b32 s14, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v16, s1
+; SI-NEXT:    ds_read2_b64 v[10:13], v16 offset0:8 offset1:9
+; SI-NEXT:    ds_read2_b64 v[17:20], v16 offset0:10 offset1:11
+; SI-NEXT:    ds_read2_b64 v[21:24], v16 offset0:12 offset1:13
 ; SI-NEXT:    s_mov_b32 s15, 0xe8f000
 ; SI-NEXT:    s_add_u32 s12, s12, s11
+; SI-NEXT:    s_waitcnt lgkmcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v12
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v11
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v10
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v13
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v20
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v18
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v17
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v19
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v19
+; SI-NEXT:    ds_read2_b64 v[17:20], v16 offset0:14 offset1:15
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v22
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v21
+; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v24
+; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
+; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v23
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v18
+; SI-NEXT:    v_and_b32_e32 v33, 0xffff, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v17
+; SI-NEXT:    v_and_b32_e32 v35, 0xffff, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v20
+; SI-NEXT:    ds_read2_b64 v[21:24], v16 offset1:1
+; SI-NEXT:    v_and_b32_e32 v37, 0xffff, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v19
+; SI-NEXT:    v_and_b32_e32 v39, 0xffff, v19
+; SI-NEXT:    ds_read2_b64 v[17:20], v16 offset0:2 offset1:3
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v22
+; SI-NEXT:    v_and_b32_e32 v41, 0xffff, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v21
+; SI-NEXT:    v_and_b32_e32 v43, 0xffff, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v24
+; SI-NEXT:    v_and_b32_e32 v45, 0xffff, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v23
+; SI-NEXT:    v_and_b32_e32 v47, 0xffff, v23
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v18
+; SI-NEXT:    v_and_b32_e32 v49, 0xffff, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v17
+; SI-NEXT:    v_and_b32_e32 v51, 0xffff, v17
+; SI-NEXT:    ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v19
+; SI-NEXT:    v_and_b32_e32 v55, 0xffff, v19
+; SI-NEXT:    ds_read2_b64 v[16:19], v16 offset0:6 offset1:7
 ; SI-NEXT:    s_addc_u32 s13, s13, 0
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v20
+; SI-NEXT:    v_and_b32_e32 v53, 0xffff, v20
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v24, s1
-; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    ds_read2_b64 v[0:3], v24 offset0:8 offset1:9
-; SI-NEXT:    ds_read2_b64 v[4:7], v24 offset0:10 offset1:11
-; SI-NEXT:    ds_read2_b64 v[12:15], v24 offset0:12 offset1:13
-; SI-NEXT:    ds_read2_b64 v[8:11], v24 offset0:14 offset1:15
-; SI-NEXT:    ds_read2_b64 v[20:23], v24 offset1:1
-; SI-NEXT:    ds_read2_b64 v[16:19], v24 offset0:2 offset1:3
-; SI-NEXT:    ds_read2_b64 v[34:37], v24 offset0:4 offset1:5
-; SI-NEXT:    ds_read2_b64 v[38:41], v24 offset0:6 offset1:7
-; SI-NEXT:    s_waitcnt lgkmcnt(7)
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v2
-; SI-NEXT:    s_waitcnt lgkmcnt(6)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v5
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v1
-; SI-NEXT:    buffer_store_dword v24, off, s[12:15], 0 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; SI-NEXT:    v_and_b32_e32 v26, 0xffff, v0
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff, v3
-; SI-NEXT:    v_and_b32_e32 v30, 0xffff, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v19
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
-; SI-NEXT:    v_and_b32_e32 v32, 0xffff, v5
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v4
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v6
-; SI-NEXT:    s_waitcnt lgkmcnt(5)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v13
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v15
-; SI-NEXT:    v_and_b32_e32 v42, 0xffff, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; SI-NEXT:    s_waitcnt lgkmcnt(4)
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v9
-; SI-NEXT:    v_and_b32_e32 v44, 0xffff, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v11
-; SI-NEXT:    v_and_b32_e32 v46, 0xffff, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    s_waitcnt lgkmcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v21
-; SI-NEXT:    v_and_b32_e32 v48, 0xffff, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v20
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v23
-; SI-NEXT:    v_and_b32_e32 v50, 0xffff, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v22
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; SI-NEXT:    s_waitcnt lgkmcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v17
-; SI-NEXT:    v_and_b32_e32 v52, 0xffff, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v18
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v18
+; SI-NEXT:    v_mov_b32_e32 v18, s0
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v22
+; SI-NEXT:    v_and_b32_e32 v57, 0xffff, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v21
+; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v24
+; SI-NEXT:    v_and_b32_e32 v59, 0xffff, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v23
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v17
+; SI-NEXT:    v_and_b32_e32 v61, 0xffff, v17
 ; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
 ; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v19
-; SI-NEXT:    v_and_b32_e32 v54, 0xffff, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; SI-NEXT:    s_waitcnt lgkmcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v35
-; SI-NEXT:    v_and_b32_e32 v56, 0xffff, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v34
-; SI-NEXT:    v_and_b32_e32 v34, 0xffff, v34
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v37
-; SI-NEXT:    v_and_b32_e32 v58, 0xffff, v37
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v36
-; SI-NEXT:    v_and_b32_e32 v36, 0xffff, v36
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v39
-; SI-NEXT:    v_and_b32_e32 v60, 0xffff, v39
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v38
-; SI-NEXT:    v_and_b32_e32 v38, 0xffff, v38
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v41
-; SI-NEXT:    v_and_b32_e32 v62, 0xffff, v41
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v40
-; SI-NEXT:    v_and_b32_e32 v40, 0xffff, v40
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    ds_write2_b64 v0, v[40:41], v[62:63] offset0:14 offset1:15
-; SI-NEXT:    ds_write2_b64 v0, v[38:39], v[60:61] offset0:12 offset1:13
-; SI-NEXT:    ds_write2_b64 v0, v[36:37], v[58:59] offset0:10 offset1:11
-; SI-NEXT:    ds_write2_b64 v0, v[34:35], v[56:57] offset0:8 offset1:9
-; SI-NEXT:    ds_write2_b64 v0, v[18:19], v[54:55] offset0:6 offset1:7
-; SI-NEXT:    ds_write2_b64 v0, v[16:17], v[52:53] offset0:4 offset1:5
-; SI-NEXT:    ds_write2_b64 v0, v[22:23], v[50:51] offset0:2 offset1:3
-; SI-NEXT:    ds_write2_b64 v0, v[20:21], v[48:49] offset1:1
-; SI-NEXT:    ds_write2_b64 v0, v[10:11], v[46:47] offset0:30 offset1:31
-; SI-NEXT:    ds_write2_b64 v0, v[8:9], v[44:45] offset0:28 offset1:29
-; SI-NEXT:    ds_write2_b64 v0, v[14:15], v[42:43] offset0:26 offset1:27
-; SI-NEXT:    ds_write2_b64 v0, v[12:13], v[6:7] offset0:24 offset1:25
-; SI-NEXT:    ds_write2_b64 v0, v[4:5], v[2:3] offset0:22 offset1:23
-; SI-NEXT:    ds_write2_b64 v0, v[24:25], v[32:33] offset0:20 offset1:21
-; SI-NEXT:    ds_write2_b64 v0, v[30:31], v[28:29] offset0:18 offset1:19
-; SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    ds_write2_b64 v18, v[0:1], v[19:20] offset0:14 offset1:15
+; SI-NEXT:    ds_write2_b64 v18, v[16:17], v[61:62] offset0:12 offset1:13
+; SI-NEXT:    ds_write2_b64 v18, v[23:24], v[59:60] offset0:10 offset1:11
+; SI-NEXT:    ds_write2_b64 v18, v[21:22], v[57:58] offset0:8 offset1:9
+; SI-NEXT:    ds_write2_b64 v18, v[55:56], v[53:54] offset0:6 offset1:7
+; SI-NEXT:    ds_write2_b64 v18, v[51:52], v[49:50] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v18, v[47:48], v[45:46] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v18, v[43:44], v[41:42] offset1:1
+; SI-NEXT:    ds_write2_b64 v18, v[39:40], v[37:38] offset0:30 offset1:31
+; SI-NEXT:    ds_write2_b64 v18, v[35:36], v[33:34] offset0:28 offset1:29
+; SI-NEXT:    ds_write2_b64 v18, v[31:32], v[29:30] offset0:26 offset1:27
+; SI-NEXT:    ds_write2_b64 v18, v[27:28], v[25:26] offset0:24 offset1:25
+; SI-NEXT:    ds_write2_b64 v18, v[14:15], v[12:13] offset0:22 offset1:23
+; SI-NEXT:    ds_write2_b64 v18, v[10:11], v[8:9] offset0:20 offset1:21
+; SI-NEXT:    ds_write2_b64 v18, v[6:7], v[4:5] offset0:18 offset1:19
+; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    ds_write2_b64 v0, v[26:27], v[1:2] offset0:16 offset1:17
+; SI-NEXT:    ds_write2_b64 v18, v[2:3], v[0:1] offset0:16 offset1:17
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32:
@@ -4077,119 +4076,115 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 ; SI-LABEL: local_sextload_v64i16_to_v64i32:
 ; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; SI-NEXT:    s_mov_b32 s14, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v28, s1
+; SI-NEXT:    ds_read2_b64 v[29:32], v28 offset0:14 offset1:15
+; SI-NEXT:    ds_read2_b64 v[33:36], v28 offset1:1
+; SI-NEXT:    ds_read2_b64 v[10:13], v28 offset0:8 offset1:9
+; SI-NEXT:    ds_read2_b64 v[14:17], v28 offset0:10 offset1:11
+; SI-NEXT:    ds_read2_b64 v[20:23], v28 offset0:12 offset1:13
 ; SI-NEXT:    s_mov_b32 s15, 0xe8f000
+; SI-NEXT:    s_waitcnt lgkmcnt(4)
+; SI-NEXT:    v_ashrrev_i32_e32 v25, 16, v30
+; SI-NEXT:    v_bfe_i32 v24, v30, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v27, 16, v29
+; SI-NEXT:    v_bfe_i32 v26, v29, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v38, 16, v32
+; SI-NEXT:    v_bfe_i32 v37, v32, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v40, 16, v31
+; SI-NEXT:    v_bfe_i32 v39, v31, 0, 16
+; SI-NEXT:    ds_read2_b64 v[29:32], v28 offset0:2 offset1:3
+; SI-NEXT:    s_waitcnt lgkmcnt(4)
+; SI-NEXT:    v_ashrrev_i32_e32 v42, 16, v34
+; SI-NEXT:    v_bfe_i32 v41, v34, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v44, 16, v33
+; SI-NEXT:    v_bfe_i32 v43, v33, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v46, 16, v36
+; SI-NEXT:    v_bfe_i32 v45, v36, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v48, 16, v35
+; SI-NEXT:    v_bfe_i32 v47, v35, 0, 16
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_ashrrev_i32_e32 v50, 16, v30
+; SI-NEXT:    v_bfe_i32 v49, v30, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v52, 16, v29
+; SI-NEXT:    v_bfe_i32 v51, v29, 0, 16
+; SI-NEXT:    ds_read2_b64 v[33:36], v28 offset0:4 offset1:5
+; SI-NEXT:    v_ashrrev_i32_e32 v56, 16, v31
+; SI-NEXT:    v_bfe_i32 v55, v31, 0, 16
+; SI-NEXT:    ds_read2_b64 v[28:31], v28 offset0:6 offset1:7
 ; SI-NEXT:    s_add_u32 s12, s12, s11
 ; SI-NEXT:    s_addc_u32 s13, s13, 0
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v11
+; SI-NEXT:    v_bfe_i32 v0, v11, 0, 16
+; SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    v_ashrrev_i32_e32 v54, 16, v32
+; SI-NEXT:    v_bfe_i32 v53, v32, 0, 16
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v20, s1
-; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    ds_read2_b64 v[4:7], v20 offset0:8 offset1:9
-; SI-NEXT:    ds_read2_b64 v[0:3], v20 offset0:10 offset1:11
-; SI-NEXT:    ds_read2_b64 v[8:11], v20 offset0:12 offset1:13
-; SI-NEXT:    ds_read2_b64 v[12:15], v20 offset0:14 offset1:15
-; SI-NEXT:    ds_read2_b64 v[16:19], v20 offset1:1
-; SI-NEXT:    ds_read2_b64 v[30:33], v20 offset0:2 offset1:3
-; SI-NEXT:    ds_read2_b64 v[34:37], v20 offset0:4 offset1:5
-; SI-NEXT:    ds_read2_b64 v[38:41], v20 offset0:6 offset1:7
-; SI-NEXT:    s_waitcnt lgkmcnt(7)
-; SI-NEXT:    v_ashrrev_i32_e32 v21, 16, v5
-; SI-NEXT:    v_ashrrev_i32_e32 v23, 16, v4
-; SI-NEXT:    v_ashrrev_i32_e32 v25, 16, v7
-; SI-NEXT:    v_ashrrev_i32_e32 v27, 16, v6
-; SI-NEXT:    s_waitcnt lgkmcnt(6)
-; SI-NEXT:    v_ashrrev_i32_e32 v29, 16, v1
-; SI-NEXT:    v_bfe_i32 v20, v5, 0, 16
-; SI-NEXT:    buffer_store_dword v20, off, s[12:15], 0 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; SI-NEXT:    v_bfe_i32 v22, v4, 0, 16
-; SI-NEXT:    v_bfe_i32 v24, v7, 0, 16
-; SI-NEXT:    v_bfe_i32 v26, v6, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v32, 16, v31
+; SI-NEXT:    v_bfe_i32 v31, v31, 0, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_ashrrev_i32_e32 v21, 16, v0
-; SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v3
-; SI-NEXT:    v_bfe_i32 v28, v1, 0, 16
-; SI-NEXT:    v_bfe_i32 v20, v0, 0, 16
-; SI-NEXT:    v_bfe_i32 v6, v3, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v2
-; SI-NEXT:    v_bfe_i32 v4, v2, 0, 16
-; SI-NEXT:    s_waitcnt lgkmcnt(5)
-; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v9
-; SI-NEXT:    v_bfe_i32 v2, v9, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v8
-; SI-NEXT:    v_bfe_i32 v8, v8, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v43, 16, v11
-; SI-NEXT:    v_bfe_i32 v42, v11, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v10
-; SI-NEXT:    v_bfe_i32 v10, v10, 0, 16
-; SI-NEXT:    s_waitcnt lgkmcnt(4)
-; SI-NEXT:    v_ashrrev_i32_e32 v45, 16, v13
-; SI-NEXT:    v_bfe_i32 v44, v13, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v13, 16, v12
-; SI-NEXT:    v_bfe_i32 v12, v12, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v47, 16, v15
-; SI-NEXT:    v_bfe_i32 v46, v15, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v14
-; SI-NEXT:    v_bfe_i32 v14, v14, 0, 16
-; SI-NEXT:    s_waitcnt lgkmcnt(3)
-; SI-NEXT:    v_ashrrev_i32_e32 v49, 16, v17
-; SI-NEXT:    v_bfe_i32 v48, v17, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v16
-; SI-NEXT:    v_bfe_i32 v16, v16, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v51, 16, v19
-; SI-NEXT:    v_bfe_i32 v50, v19, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v19, 16, v18
-; SI-NEXT:    v_bfe_i32 v18, v18, 0, 16
-; SI-NEXT:    s_waitcnt lgkmcnt(2)
-; SI-NEXT:    v_ashrrev_i32_e32 v53, 16, v31
-; SI-NEXT:    v_bfe_i32 v52, v31, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v31, 16, v30
-; SI-NEXT:    v_bfe_i32 v30, v30, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v55, 16, v33
-; SI-NEXT:    v_bfe_i32 v54, v33, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v33, 16, v32
-; SI-NEXT:    v_bfe_i32 v32, v32, 0, 16
-; SI-NEXT:    s_waitcnt lgkmcnt(1)
-; SI-NEXT:    v_ashrrev_i32_e32 v57, 16, v35
-; SI-NEXT:    v_bfe_i32 v56, v35, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v35, 16, v34
-; SI-NEXT:    v_bfe_i32 v34, v34, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v59, 16, v37
-; SI-NEXT:    v_bfe_i32 v58, v37, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v37, 16, v36
-; SI-NEXT:    v_bfe_i32 v36, v36, 0, 16
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_ashrrev_i32_e32 v61, 16, v39
-; SI-NEXT:    v_bfe_i32 v60, v39, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v39, 16, v38
-; SI-NEXT:    v_bfe_i32 v38, v38, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v63, 16, v41
-; SI-NEXT:    v_bfe_i32 v62, v41, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v41, 16, v40
-; SI-NEXT:    v_bfe_i32 v40, v40, 0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    ds_write2_b64 v0, v[40:41], v[62:63] offset0:14 offset1:15
-; SI-NEXT:    ds_write2_b64 v0, v[38:39], v[60:61] offset0:12 offset1:13
-; SI-NEXT:    ds_write2_b64 v0, v[36:37], v[58:59] offset0:10 offset1:11
-; SI-NEXT:    ds_write2_b64 v0, v[34:35], v[56:57] offset0:8 offset1:9
-; SI-NEXT:    ds_write2_b64 v0, v[32:33], v[54:55] offset0:6 offset1:7
-; SI-NEXT:    ds_write2_b64 v0, v[30:31], v[52:53] offset0:4 offset1:5
-; SI-NEXT:    ds_write2_b64 v0, v[18:19], v[50:51] offset0:2 offset1:3
-; SI-NEXT:    ds_write2_b64 v0, v[16:17], v[48:49] offset1:1
-; SI-NEXT:    ds_write2_b64 v0, v[14:15], v[46:47] offset0:30 offset1:31
-; SI-NEXT:    ds_write2_b64 v0, v[12:13], v[44:45] offset0:28 offset1:29
-; SI-NEXT:    ds_write2_b64 v0, v[10:11], v[42:43] offset0:26 offset1:27
-; SI-NEXT:    ds_write2_b64 v0, v[8:9], v[2:3] offset0:24 offset1:25
-; SI-NEXT:    ds_write2_b64 v0, v[4:5], v[6:7] offset0:22 offset1:23
-; SI-NEXT:    ds_write2_b64 v0, v[20:21], v[28:29] offset0:20 offset1:21
-; SI-NEXT:    ds_write2_b64 v0, v[26:27], v[24:25] offset0:18 offset1:19
-; SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v30
+; SI-NEXT:    v_bfe_i32 v0, v30, 0, 16
+; SI-NEXT:    v_mov_b32_e32 v30, s0
+; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v10
+; SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v13
+; SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v12
+; SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v15
+; SI-NEXT:    v_bfe_i32 v2, v10, 0, 16
+; SI-NEXT:    v_bfe_i32 v4, v13, 0, 16
+; SI-NEXT:    v_bfe_i32 v6, v12, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v14
+; SI-NEXT:    v_ashrrev_i32_e32 v13, 16, v17
+; SI-NEXT:    v_bfe_i32 v8, v15, 0, 16
+; SI-NEXT:    v_bfe_i32 v10, v14, 0, 16
+; SI-NEXT:    v_bfe_i32 v12, v17, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v16
+; SI-NEXT:    v_bfe_i32 v14, v16, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v21
+; SI-NEXT:    v_bfe_i32 v16, v21, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 16, v20
+; SI-NEXT:    v_bfe_i32 v18, v20, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v21, 16, v23
+; SI-NEXT:    v_bfe_i32 v20, v23, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v23, 16, v22
+; SI-NEXT:    v_bfe_i32 v22, v22, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v58, 16, v34
+; SI-NEXT:    v_bfe_i32 v57, v34, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v34, 16, v33
+; SI-NEXT:    v_bfe_i32 v33, v33, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v60, 16, v36
+; SI-NEXT:    v_bfe_i32 v59, v36, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v36, 16, v35
+; SI-NEXT:    v_bfe_i32 v35, v35, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v62, 16, v29
+; SI-NEXT:    v_bfe_i32 v61, v29, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v29, 16, v28
+; SI-NEXT:    v_bfe_i32 v28, v28, 0, 16
+; SI-NEXT:    ds_write2_b64 v30, v[0:1], v[31:32] offset0:14 offset1:15
+; SI-NEXT:    ds_write2_b64 v30, v[28:29], v[61:62] offset0:12 offset1:13
+; SI-NEXT:    ds_write2_b64 v30, v[35:36], v[59:60] offset0:10 offset1:11
+; SI-NEXT:    ds_write2_b64 v30, v[33:34], v[57:58] offset0:8 offset1:9
+; SI-NEXT:    ds_write2_b64 v30, v[55:56], v[53:54] offset0:6 offset1:7
+; SI-NEXT:    ds_write2_b64 v30, v[51:52], v[49:50] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v30, v[47:48], v[45:46] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v30, v[43:44], v[41:42] offset1:1
+; SI-NEXT:    ds_write2_b64 v30, v[39:40], v[37:38] offset0:30 offset1:31
+; SI-NEXT:    ds_write2_b64 v30, v[26:27], v[24:25] offset0:28 offset1:29
+; SI-NEXT:    ds_write2_b64 v30, v[22:23], v[20:21] offset0:26 offset1:27
+; SI-NEXT:    ds_write2_b64 v30, v[18:19], v[16:17] offset0:24 offset1:25
+; SI-NEXT:    ds_write2_b64 v30, v[14:15], v[12:13] offset0:22 offset1:23
+; SI-NEXT:    ds_write2_b64 v30, v[10:11], v[8:9] offset0:20 offset1:21
+; SI-NEXT:    ds_write2_b64 v30, v[6:7], v[4:5] offset0:18 offset1:19
+; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    ds_write2_b64 v0, v[22:23], v[1:2] offset0:16 offset1:17
+; SI-NEXT:    ds_write2_b64 v30, v[2:3], v[0:1] offset0:16 offset1:17
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32:
@@ -5108,11 +5103,11 @@ define amdgpu_kernel void @local_zextload_i16_to_i64(ptr addrspace(3) %out, ptr
 ; SI-LABEL: local_zextload_i16_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_u16 v0, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    ds_write_b64 v2, v[0:1]
@@ -5172,13 +5167,13 @@ define amdgpu_kernel void @local_sextload_i16_to_i64(ptr addrspace(3) %out, ptr
 ; SI-LABEL: local_sextload_i16_to_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_i16 v0, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    ds_write_b64 v2, v[0:1]
 ; SI-NEXT:    s_endpgm
 ;
@@ -5234,11 +5229,11 @@ define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(ptr addrspace(3) %out,
 ; SI-LABEL: local_zextload_v1i16_to_v1i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_u16 v0, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    ds_write_b64 v2, v[0:1]
@@ -5294,13 +5289,13 @@ define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(ptr addrspace(3) %out,
 ; SI-LABEL: local_sextload_v1i16_to_v1i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_i16 v0, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    ds_write_b64 v2, v[0:1]
 ; SI-NEXT:    s_endpgm
 ;
@@ -5356,16 +5351,16 @@ define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(ptr addrspace(3) %out,
 ; SI-LABEL: local_zextload_v2i16_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    v_mov_b32_e32 v3, v1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_b32 v2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    v_mov_b32_e32 v4, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_mov_b32_e32 v4, s0
-; SI-NEXT:    v_mov_b32_e32 v3, v1
 ; SI-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
 ; SI-NEXT:    s_endpgm
 ;
@@ -5465,17 +5460,17 @@ define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(ptr addrspace(3) %out,
 ; SI-LABEL: local_sextload_v2i16_to_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_b32 v0, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; SI-NEXT:    v_bfe_i32 v2, v1, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT:    v_mov_b32_e32 v4, s0
+; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; SI-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
 ; SI-NEXT:    s_endpgm
 ;
@@ -5578,15 +5573,15 @@ define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(ptr addrspace(3) %out,
 ; SI-LABEL: local_zextload_v4i16_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s1
 ; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    ds_read_b64 v[0:1], v0
 ; SI-NEXT:    v_mov_b32_e32 v3, 0
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    v_mov_b32_e32 v7, v3
-; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    ds_read_b64 v[0:1], v0
 ; SI-NEXT:    v_mov_b32_e32 v10, s0
+; SI-NEXT:    v_mov_b32_e32 v9, v3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
@@ -5730,16 +5725,16 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out,
 ; SI-LABEL: local_sextload_v4i16_to_v4i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read_b64 v[0:1], v0
 ; SI-NEXT:    v_mov_b32_e32 v8, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; SI-NEXT:    v_bfe_i32 v4, v1, 0, 16
 ; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
 ; SI-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
-; SI-NEXT:    v_bfe_i32 v4, v1, 0, 16
 ; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; SI-NEXT:    v_bfe_i32 v6, v5, 0, 16
 ; SI-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
@@ -5893,32 +5888,32 @@ define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(ptr addrspace(3) %out,
 ; SI-LABEL: local_zextload_v8i16_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s1
 ; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
 ; SI-NEXT:    v_mov_b32_e32 v5, 0
 ; SI-NEXT:    v_mov_b32_e32 v7, v5
+; SI-NEXT:    v_mov_b32_e32 v12, v5
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; SI-NEXT:    v_mov_b32_e32 v13, s0
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
-; SI-NEXT:    v_mov_b32_e32 v11, v5
-; SI-NEXT:    v_mov_b32_e32 v13, v5
-; SI-NEXT:    v_mov_b32_e32 v15, v5
-; SI-NEXT:    v_mov_b32_e32 v17, v5
-; SI-NEXT:    v_mov_b32_e32 v19, v5
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v0
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v1
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v2
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v3
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    ds_write2_b64 v0, v[8:9], v[6:7] offset0:6 offset1:7
-; SI-NEXT:    ds_write2_b64 v0, v[12:13], v[4:5] offset0:2 offset1:3
-; SI-NEXT:    ds_write2_b64 v0, v[10:11], v[16:17] offset0:4 offset1:5
-; SI-NEXT:    ds_write2_b64 v0, v[14:15], v[18:19] offset1:1
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; SI-NEXT:    v_mov_b32_e32 v3, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    ds_write2_b64 v13, v[2:3], v[6:7] offset0:6 offset1:7
+; SI-NEXT:    v_mov_b32_e32 v2, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    ds_write2_b64 v13, v[1:2], v[4:5] offset0:2 offset1:3
+; SI-NEXT:    v_mov_b32_e32 v1, v5
+; SI-NEXT:    ds_write2_b64 v13, v[11:12], v[8:9] offset0:4 offset1:5
+; SI-NEXT:    v_mov_b32_e32 v11, v5
+; SI-NEXT:    ds_write2_b64 v13, v[0:1], v[10:11] offset1:1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-NO-DS128-LABEL: local_zextload_v8i16_to_v8i64:
@@ -6131,34 +6126,34 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out,
 ; SI-LABEL: local_sextload_v8i16_to_v8i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
 ; SI-NEXT:    v_mov_b32_e32 v16, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
-; SI-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
-; SI-NEXT:    v_ashrrev_i32_e32 v4, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; SI-NEXT:    v_bfe_i32 v14, v2, 0, 16
+; SI-NEXT:    v_bfe_i32 v2, v3, 0, 16
 ; SI-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
 ; SI-NEXT:    v_ashrrev_i32_e32 v6, 16, v3
-; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; SI-NEXT:    v_bfe_i32 v8, v1, 0, 16
-; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; SI-NEXT:    v_bfe_i32 v10, v3, 0, 16
-; SI-NEXT:    v_bfe_i32 v12, v11, 0, 16
-; SI-NEXT:    v_bfe_i32 v14, v9, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; SI-NEXT:    v_bfe_i32 v8, v8, 0, 16
+; SI-NEXT:    v_bfe_i32 v10, v9, 0, 16
+; SI-NEXT:    v_bfe_i32 v12, v0, 0, 16
+; SI-NEXT:    v_bfe_i32 v0, v1, 0, 16
 ; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v4, 16, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
 ; SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
 ; SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; SI-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
-; SI-NEXT:    ds_write2_b64 v16, v[10:11], v[6:7] offset0:6 offset1:7
-; SI-NEXT:    ds_write2_b64 v16, v[8:9], v[4:5] offset0:2 offset1:3
-; SI-NEXT:    ds_write2_b64 v16, v[2:3], v[14:15] offset0:4 offset1:5
-; SI-NEXT:    ds_write2_b64 v16, v[0:1], v[12:13] offset1:1
+; SI-NEXT:    ds_write2_b64 v16, v[2:3], v[6:7] offset0:6 offset1:7
+; SI-NEXT:    ds_write2_b64 v16, v[0:1], v[4:5] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v16, v[14:15], v[10:11] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v16, v[12:13], v[8:9] offset1:1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-NO-DS128-LABEL: local_sextload_v8i16_to_v8i64:
@@ -6394,50 +6389,48 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out
 ; SI-LABEL: local_zextload_v16i16_to_v16i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, s1
 ; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
 ; SI-NEXT:    v_mov_b32_e32 v9, 0
-; SI-NEXT:    ds_read2_b64 v[4:7], v4 offset1:1
-; SI-NEXT:    v_mov_b32_e32 v11, v9
-; SI-NEXT:    v_mov_b32_e32 v13, v9
 ; SI-NEXT:    v_mov_b32_e32 v15, v9
 ; SI-NEXT:    v_mov_b32_e32 v17, v9
-; SI-NEXT:    v_mov_b32_e32 v20, s0
-; SI-NEXT:    s_waitcnt lgkmcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v1
-; SI-NEXT:    ds_write2_b64 v20, v[16:17], v[14:15] offset0:10 offset1:11
-; SI-NEXT:    v_mov_b32_e32 v16, v9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v4, s1
+; SI-NEXT:    ds_read2_b64 v[0:3], v4 offset1:1
+; SI-NEXT:    ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; SI-NEXT:    v_mov_b32_e32 v18, s0
+; SI-NEXT:    v_mov_b32_e32 v13, v9
+; SI-NEXT:    v_mov_b32_e32 v11, v9
 ; SI-NEXT:    s_waitcnt lgkmcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v3
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v3
-; SI-NEXT:    ds_write2_b64 v20, v[14:15], v[12:13] offset0:14 offset1:15
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v7
-; SI-NEXT:    ds_write2_b64 v20, v[15:16], v[10:11] offset0:6 offset1:7
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v4
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v2
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v0
-; SI-NEXT:    v_mov_b32_e32 v5, v9
-; SI-NEXT:    ds_write2_b64 v20, v[4:5], v[8:9] offset0:2 offset1:3
-; SI-NEXT:    v_mov_b32_e32 v19, v9
-; SI-NEXT:    v_mov_b32_e32 v8, v9
-; SI-NEXT:    v_mov_b32_e32 v15, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v7
+; SI-NEXT:    ds_write2_b64 v18, v[16:17], v[14:15] offset0:10 offset1:11
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; SI-NEXT:    ds_write2_b64 v18, v[14:15], v[12:13] offset0:14 offset1:15
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v2
 ; SI-NEXT:    v_mov_b32_e32 v2, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; SI-NEXT:    ds_write2_b64 v18, v[12:13], v[10:11] offset0:6 offset1:7
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v6
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v4
+; SI-NEXT:    ds_write2_b64 v18, v[1:2], v[8:9] offset0:2 offset1:3
 ; SI-NEXT:    v_mov_b32_e32 v4, v9
-; SI-NEXT:    ds_write2_b64 v20, v[18:19], v[12:13] offset0:8 offset1:9
-; SI-NEXT:    ds_write2_b64 v20, v[16:17], v[14:15] offset0:12 offset1:13
-; SI-NEXT:    ds_write2_b64 v20, v[10:11], v[1:2] offset0:4 offset1:5
-; SI-NEXT:    ds_write2_b64 v20, v[7:8], v[3:4] offset1:1
+; SI-NEXT:    v_mov_b32_e32 v1, v9
+; SI-NEXT:    v_mov_b32_e32 v6, v9
+; SI-NEXT:    v_mov_b32_e32 v8, v9
+; SI-NEXT:    ds_write2_b64 v18, v[16:17], v[10:11] offset0:8 offset1:9
+; SI-NEXT:    ds_write2_b64 v18, v[14:15], v[12:13] offset0:12 offset1:13
+; SI-NEXT:    ds_write2_b64 v18, v[3:4], v[5:6] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v18, v[0:1], v[7:8] offset1:1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-NO-DS128-LABEL: local_zextload_v16i16_to_v16i64:
@@ -6801,58 +6794,58 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out
 ; SI-LABEL: local_sextload_v16i16_to_v16i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v4, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
 ; SI-NEXT:    ds_read2_b64 v[4:7], v4 offset1:1
-; SI-NEXT:    v_mov_b32_e32 v18, s0
+; SI-NEXT:    v_mov_b32_e32 v19, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
-; SI-NEXT:    v_ashrrev_i32_e32 v9, 31, v5
-; SI-NEXT:    v_ashrrev_i32_e32 v8, 16, v5
+; SI-NEXT:    v_bfe_i32 v12, v3, 0, 16
 ; SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v3
 ; SI-NEXT:    v_ashrrev_i32_e32 v10, 16, v3
-; SI-NEXT:    v_bfe_i32 v12, v3, 0, 16
 ; SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
-; SI-NEXT:    ds_write2_b64 v18, v[12:13], v[10:11] offset0:14 offset1:15
+; SI-NEXT:    ds_write2_b64 v19, v[12:13], v[10:11] offset0:14 offset1:15
+; SI-NEXT:    v_bfe_i32 v12, v1, 0, 16
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
 ; SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v1
 ; SI-NEXT:    v_ashrrev_i32_e32 v10, 16, v1
-; SI-NEXT:    v_bfe_i32 v12, v1, 0, 16
 ; SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
-; SI-NEXT:    ds_write2_b64 v18, v[12:13], v[10:11] offset0:10 offset1:11
+; SI-NEXT:    ds_write2_b64 v19, v[12:13], v[10:11] offset0:10 offset1:11
+; SI-NEXT:    v_bfe_i32 v12, v14, 0, 16
+; SI-NEXT:    v_bfe_i32 v14, v7, 0, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
 ; SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v7
 ; SI-NEXT:    v_ashrrev_i32_e32 v10, 16, v7
-; SI-NEXT:    v_bfe_i32 v12, v7, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
-; SI-NEXT:    ds_write2_b64 v18, v[12:13], v[10:11] offset0:6 offset1:7
-; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
-; SI-NEXT:    v_bfe_i32 v3, v5, 0, 16
-; SI-NEXT:    v_bfe_i32 v5, v6, 0, 16
-; SI-NEXT:    v_bfe_i32 v7, v0, 0, 16
-; SI-NEXT:    v_bfe_i32 v10, v2, 0, 16
-; SI-NEXT:    v_bfe_i32 v12, v17, 0, 16
-; SI-NEXT:    v_bfe_i32 v14, v14, 0, 16
-; SI-NEXT:    v_bfe_i32 v16, v16, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; SI-NEXT:    ds_write2_b64 v18, v[3:4], v[8:9] offset0:2 offset1:3
-; SI-NEXT:    v_bfe_i32 v3, v15, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; SI-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
+; SI-NEXT:    ds_write2_b64 v19, v[14:15], v[10:11] offset0:6 offset1:7
+; SI-NEXT:    v_bfe_i32 v14, v16, 0, 16
+; SI-NEXT:    v_bfe_i32 v16, v5, 0, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v2
+; SI-NEXT:    v_ashrrev_i32_e32 v9, 31, v5
+; SI-NEXT:    v_ashrrev_i32_e32 v8, 16, v5
+; SI-NEXT:    v_bfe_i32 v10, v17, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; SI-NEXT:    ds_write2_b64 v19, v[16:17], v[8:9] offset0:2 offset1:3
+; SI-NEXT:    v_bfe_i32 v7, v18, 0, 16
+; SI-NEXT:    v_bfe_i32 v16, v2, 0, 16
 ; SI-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
-; SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; SI-NEXT:    v_bfe_i32 v3, v4, 0, 16
+; SI-NEXT:    v_bfe_i32 v5, v6, 0, 16
+; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
 ; SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
 ; SI-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
-; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
 ; SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; SI-NEXT:    ds_write2_b64 v18, v[10:11], v[3:4] offset0:12 offset1:13
-; SI-NEXT:    ds_write2_b64 v18, v[7:8], v[16:17] offset0:8 offset1:9
-; SI-NEXT:    ds_write2_b64 v18, v[5:6], v[14:15] offset0:4 offset1:5
-; SI-NEXT:    ds_write2_b64 v18, v[1:2], v[12:13] offset1:1
+; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT:    ds_write2_b64 v19, v[16:17], v[7:8] offset0:12 offset1:13
+; SI-NEXT:    ds_write2_b64 v19, v[0:1], v[14:15] offset0:8 offset1:9
+; SI-NEXT:    ds_write2_b64 v19, v[5:6], v[10:11] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v19, v[3:4], v[12:13] offset1:1
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64:
@@ -7277,80 +7270,78 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out
 ; SI-LABEL: local_zextload_v32i16_to_v32i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s1
 ; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    ds_read2_b64 v[2:5], v0 offset0:2 offset1:3
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    ds_read2_b64 v[6:9], v0 offset1:1
-; SI-NEXT:    v_mov_b32_e32 v19, v1
-; SI-NEXT:    v_mov_b32_e32 v21, v1
-; SI-NEXT:    v_mov_b32_e32 v22, s0
-; SI-NEXT:    s_waitcnt lgkmcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v5
-; SI-NEXT:    ds_read2_b64 v[10:13], v0 offset0:4 offset1:5
-; SI-NEXT:    ds_read2_b64 v[14:17], v0 offset0:6 offset1:7
-; SI-NEXT:    ds_write2_b64 v22, v[20:21], v[18:19] offset0:14 offset1:15
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v3
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v3
-; SI-NEXT:    ds_write2_b64 v22, v[20:21], v[18:19] offset0:10 offset1:11
-; SI-NEXT:    s_waitcnt lgkmcnt(4)
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v9
-; SI-NEXT:    ds_write2_b64 v22, v[20:21], v[18:19] offset0:6 offset1:7
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v7
-; SI-NEXT:    ds_write2_b64 v22, v[20:21], v[18:19] offset0:2 offset1:3
+; SI-NEXT:    v_mov_b32_e32 v4, 0
+; SI-NEXT:    v_mov_b32_e32 v17, v4
+; SI-NEXT:    v_mov_b32_e32 v19, v4
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v12, s1
+; SI-NEXT:    ds_read2_b64 v[0:3], v12 offset1:1
+; SI-NEXT:    ds_read2_b64 v[5:8], v12 offset0:2 offset1:3
+; SI-NEXT:    v_mov_b32_e32 v20, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v8
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v8
+; SI-NEXT:    ds_read2_b64 v[8:11], v12 offset0:4 offset1:5
+; SI-NEXT:    ds_read2_b64 v[12:15], v12 offset0:6 offset1:7
+; SI-NEXT:    ds_write2_b64 v20, v[18:19], v[16:17] offset0:14 offset1:15
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v6
+; SI-NEXT:    ds_write2_b64 v20, v[18:19], v[16:17] offset0:10 offset1:11
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v3
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v3
+; SI-NEXT:    ds_write2_b64 v20, v[18:19], v[16:17] offset0:6 offset1:7
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v1
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v1
+; SI-NEXT:    ds_write2_b64 v20, v[18:19], v[16:17] offset0:2 offset1:3
 ; SI-NEXT:    s_waitcnt lgkmcnt(4)
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v17
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v17
-; SI-NEXT:    ds_write2_b64 v22, v[20:21], v[18:19] offset0:30 offset1:31
-; SI-NEXT:    v_mov_b32_e32 v18, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
-; SI-NEXT:    v_mov_b32_e32 v20, v1
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v15
-; SI-NEXT:    ds_write2_b64 v22, v[19:20], v[17:18] offset0:26 offset1:27
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v13
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v13
-; SI-NEXT:    ds_write2_b64 v22, v[19:20], v[17:18] offset0:22 offset1:23
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
-; SI-NEXT:    v_mov_b32_e32 v5, v1
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    ds_write2_b64 v22, v[4:5], v[17:18] offset0:12 offset1:13
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v2
-; SI-NEXT:    v_mov_b32_e32 v4, v1
-; SI-NEXT:    ds_write2_b64 v22, v[17:18], v[3:4] offset0:8 offset1:9
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v7, v1
-; SI-NEXT:    v_mov_b32_e32 v3, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
-; SI-NEXT:    ds_write2_b64 v22, v[8:9], v[2:3] offset0:4 offset1:5
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v10
-; SI-NEXT:    ds_write2_b64 v22, v[6:7], v[4:5] offset1:1
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v10
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v11
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v15
+; SI-NEXT:    ds_write2_b64 v20, v[18:19], v[16:17] offset0:30 offset1:31
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
+; SI-NEXT:    v_mov_b32_e32 v16, v4
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v13
+; SI-NEXT:    v_mov_b32_e32 v18, v4
+; SI-NEXT:    ds_write2_b64 v20, v[17:18], v[15:16] offset0:26 offset1:27
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v11
+; SI-NEXT:    ds_write2_b64 v20, v[17:18], v[15:16] offset0:22 offset1:23
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
+; SI-NEXT:    v_mov_b32_e32 v7, v4
+; SI-NEXT:    ds_write2_b64 v20, v[15:16], v[6:7] offset0:12 offset1:13
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v5
+; SI-NEXT:    ds_write2_b64 v20, v[15:16], v[6:7] offset0:8 offset1:9
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v2
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    ds_write2_b64 v20, v[5:6], v[1:2] offset0:4 offset1:5
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
+; SI-NEXT:    ds_write2_b64 v20, v[6:7], v[1:2] offset1:1
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v8
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v12
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v14
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v16
-; SI-NEXT:    v_mov_b32_e32 v6, v1
-; SI-NEXT:    ds_write2_b64 v22, v[5:6], v[0:1] offset0:18 offset1:19
-; SI-NEXT:    v_mov_b32_e32 v11, v1
-; SI-NEXT:    v_mov_b32_e32 v5, v1
-; SI-NEXT:    v_mov_b32_e32 v13, v1
-; SI-NEXT:    v_mov_b32_e32 v16, v1
-; SI-NEXT:    ds_write2_b64 v22, v[19:20], v[12:13] offset0:28 offset1:29
-; SI-NEXT:    ds_write2_b64 v22, v[17:18], v[15:16] offset0:24 offset1:25
-; SI-NEXT:    ds_write2_b64 v22, v[10:11], v[2:3] offset0:20 offset1:21
-; SI-NEXT:    ds_write2_b64 v22, v[4:5], v[8:9] offset0:16 offset1:17
+; SI-NEXT:    ds_write2_b64 v20, v[6:7], v[3:4] offset0:18 offset1:19
+; SI-NEXT:    v_mov_b32_e32 v9, v4
+; SI-NEXT:    v_mov_b32_e32 v3, v4
+; SI-NEXT:    v_mov_b32_e32 v11, v4
+; SI-NEXT:    v_mov_b32_e32 v14, v4
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v1, v4
+; SI-NEXT:    ds_write2_b64 v20, v[17:18], v[10:11] offset0:28 offset1:29
+; SI-NEXT:    ds_write2_b64 v20, v[15:16], v[13:14] offset0:24 offset1:25
+; SI-NEXT:    ds_write2_b64 v20, v[8:9], v[5:6] offset0:20 offset1:21
+; SI-NEXT:    ds_write2_b64 v20, v[2:3], v[0:1] offset0:16 offset1:17
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-NO-DS128-LABEL: local_zextload_v32i16_to_v32i64:
@@ -8009,106 +8000,106 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out
 ; SI-LABEL: local_sextload_v32i16_to_v32i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v12, s1
 ; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    ds_read2_b64 v[4:7], v12 offset0:2 offset1:3
-; SI-NEXT:    ds_read2_b64 v[0:3], v12 offset1:1
-; SI-NEXT:    ds_read2_b64 v[8:11], v12 offset0:6 offset1:7
-; SI-NEXT:    ds_read2_b64 v[12:15], v12 offset0:4 offset1:5
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v5, s1
+; SI-NEXT:    ds_read2_b64 v[0:3], v5 offset0:2 offset1:3
+; SI-NEXT:    ds_read2_b64 v[6:9], v5 offset1:1
+; SI-NEXT:    v_mov_b32_e32 v4, s0
+; SI-NEXT:    ds_read2_b64 v[10:13], v5 offset0:6 offset1:7
+; SI-NEXT:    ds_read2_b64 v[14:17], v5 offset0:4 offset1:5
 ; SI-NEXT:    s_waitcnt lgkmcnt(3)
-; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v7
-; SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v7
-; SI-NEXT:    v_bfe_i32 v18, v7, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
-; SI-NEXT:    v_mov_b32_e32 v7, s0
-; SI-NEXT:    ds_write2_b64 v7, v[18:19], v[16:17] offset0:14 offset1:15
-; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v5
-; SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v5
-; SI-NEXT:    v_bfe_i32 v18, v5, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
-; SI-NEXT:    ds_write2_b64 v7, v[18:19], v[16:17] offset0:10 offset1:11
+; SI-NEXT:    v_bfe_i32 v20, v3, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v3
+; SI-NEXT:    v_ashrrev_i32_e32 v18, 16, v3
+; SI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
+; SI-NEXT:    ds_write2_b64 v4, v[20:21], v[18:19] offset0:14 offset1:15
+; SI-NEXT:    v_bfe_i32 v20, v1, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v18, 16, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
+; SI-NEXT:    ds_write2_b64 v4, v[20:21], v[18:19] offset0:10 offset1:11
 ; SI-NEXT:    s_waitcnt lgkmcnt(4)
-; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v3
-; SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v3
-; SI-NEXT:    v_bfe_i32 v18, v3, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
-; SI-NEXT:    ds_write2_b64 v7, v[18:19], v[16:17] offset0:6 offset1:7
-; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v1
-; SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v1
-; SI-NEXT:    v_bfe_i32 v18, v1, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
-; SI-NEXT:    ds_write2_b64 v7, v[18:19], v[16:17] offset0:2 offset1:3
+; SI-NEXT:    v_bfe_i32 v20, v9, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v9
+; SI-NEXT:    v_ashrrev_i32_e32 v18, 16, v9
+; SI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
+; SI-NEXT:    ds_write2_b64 v4, v[20:21], v[18:19] offset0:6 offset1:7
+; SI-NEXT:    v_bfe_i32 v20, v7, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v7
+; SI-NEXT:    v_ashrrev_i32_e32 v18, 16, v7
+; SI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
+; SI-NEXT:    ds_write2_b64 v4, v[20:21], v[18:19] offset0:2 offset1:3
 ; SI-NEXT:    s_waitcnt lgkmcnt(5)
-; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v11
-; SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v11
-; SI-NEXT:    v_bfe_i32 v18, v11, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
-; SI-NEXT:    ds_write2_b64 v7, v[18:19], v[16:17] offset0:30 offset1:31
-; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v9
-; SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v9
-; SI-NEXT:    v_bfe_i32 v18, v9, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
-; SI-NEXT:    ds_write2_b64 v7, v[18:19], v[16:17] offset0:26 offset1:27
+; SI-NEXT:    v_bfe_i32 v20, v13, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v13
+; SI-NEXT:    v_ashrrev_i32_e32 v18, 16, v13
+; SI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
+; SI-NEXT:    ds_write2_b64 v4, v[20:21], v[18:19] offset0:30 offset1:31
+; SI-NEXT:    v_bfe_i32 v20, v11, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v11
+; SI-NEXT:    v_ashrrev_i32_e32 v18, 16, v11
+; SI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
+; SI-NEXT:    ds_write2_b64 v4, v[20:21], v[18:19] offset0:26 offset1:27
 ; SI-NEXT:    s_waitcnt lgkmcnt(6)
-; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v15
-; SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v15
-; SI-NEXT:    v_bfe_i32 v18, v15, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
-; SI-NEXT:    ds_write2_b64 v7, v[18:19], v[16:17] offset0:22 offset1:23
-; SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v13
-; SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v13
-; SI-NEXT:    v_bfe_i32 v17, v13, 0, 16
+; SI-NEXT:    v_bfe_i32 v20, v17, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v17
+; SI-NEXT:    v_ashrrev_i32_e32 v18, 16, v17
+; SI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
+; SI-NEXT:    ds_write2_b64 v4, v[20:21], v[18:19] offset0:22 offset1:23
+; SI-NEXT:    v_bfe_i32 v19, v15, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v15
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v15
+; SI-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; SI-NEXT:    ds_write2_b64 v4, v[19:20], v[17:18] offset0:18 offset1:19
+; SI-NEXT:    v_bfe_i32 v17, v1, 0, 16
+; SI-NEXT:    v_bfe_i32 v1, v2, 0, 16
 ; SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
-; SI-NEXT:    ds_write2_b64 v7, v[17:18], v[15:16] offset0:18 offset1:19
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; SI-NEXT:    ds_write2_b64 v4, v[1:2], v[17:18] offset0:12 offset1:13
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT:    v_bfe_i32 v2, v1, 0, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
+; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset0:8 offset1:9
+; SI-NEXT:    v_bfe_i32 v0, v5, 0, 16
+; SI-NEXT:    v_bfe_i32 v2, v8, 0, 16
 ; SI-NEXT:    v_bfe_i32 v5, v6, 0, 16
+; SI-NEXT:    v_bfe_i32 v17, v9, 0, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v12
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
 ; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
-; SI-NEXT:    v_bfe_i32 v15, v1, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; SI-NEXT:    ds_write2_b64 v7, v[5:6], v[15:16] offset0:12 offset1:13
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; SI-NEXT:    v_bfe_i32 v3, v4, 0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v8
-; SI-NEXT:    v_bfe_i32 v5, v1, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
-; SI-NEXT:    ds_write2_b64 v7, v[3:4], v[5:6] offset0:8 offset1:9
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v12
-; SI-NEXT:    v_bfe_i32 v1, v12, 0, 16
-; SI-NEXT:    v_bfe_i32 v3, v14, 0, 16
-; SI-NEXT:    v_bfe_i32 v5, v8, 0, 16
-; SI-NEXT:    v_bfe_i32 v8, v10, 0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
-; SI-NEXT:    v_bfe_i32 v9, v0, 0, 16
-; SI-NEXT:    v_bfe_i32 v10, v2, 0, 16
-; SI-NEXT:    v_bfe_i32 v12, v11, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
-; SI-NEXT:    ds_write2_b64 v7, v[10:11], v[12:13] offset0:4 offset1:5
-; SI-NEXT:    v_bfe_i32 v11, v6, 0, 16
-; SI-NEXT:    v_bfe_i32 v13, v4, 0, 16
-; SI-NEXT:    v_bfe_i32 v15, v15, 0, 16
-; SI-NEXT:    v_bfe_i32 v16, v14, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
-; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; SI-NEXT:    ds_write2_b64 v7, v[9:10], v[16:17] offset1:1
-; SI-NEXT:    v_bfe_i32 v17, v18, 0, 16
-; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v10
+; SI-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset0:4 offset1:5
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
+; SI-NEXT:    ds_write2_b64 v4, v[5:6], v[17:18] offset1:1
+; SI-NEXT:    v_bfe_i32 v5, v11, 0, 16
+; SI-NEXT:    v_bfe_i32 v11, v12, 0, 16
+; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT:    v_bfe_i32 v7, v7, 0, 16
 ; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
-; SI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; SI-NEXT:    v_bfe_i32 v13, v14, 0, 16
+; SI-NEXT:    v_bfe_i32 v15, v16, 0, 16
+; SI-NEXT:    v_bfe_i32 v9, v10, 0, 16
 ; SI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
 ; SI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
 ; SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
-; SI-NEXT:    ds_write2_b64 v7, v[8:9], v[17:18] offset0:28 offset1:29
-; SI-NEXT:    ds_write2_b64 v7, v[5:6], v[15:16] offset0:24 offset1:25
-; SI-NEXT:    ds_write2_b64 v7, v[3:4], v[13:14] offset0:20 offset1:21
-; SI-NEXT:    ds_write2_b64 v7, v[1:2], v[11:12] offset0:16 offset1:17
+; SI-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
+; SI-NEXT:    ds_write2_b64 v4, v[11:12], v[5:6] offset0:28 offset1:29
+; SI-NEXT:    ds_write2_b64 v4, v[9:10], v[7:8] offset0:24 offset1:25
+; SI-NEXT:    ds_write2_b64 v4, v[15:16], v[2:3] offset0:20 offset1:21
+; SI-NEXT:    ds_write2_b64 v4, v[13:14], v[0:1] offset0:16 offset1:17
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64:
@@ -8927,9 +8918,9 @@ define amdgpu_kernel void @local_v8i16_to_128(ptr addrspace(3) %out, ptr addrspa
 ; SI-LABEL: local_v8i16_to_128:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
 ; SI-NEXT:    v_mov_b32_e32 v4, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll
index 4d751f2605c39..7c6000d092768 100644
--- a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll
+++ b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; OBJ:       Relocations [
 ; OBJ-NEXT: ]
@@ -291,7 +291,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GCN-NEXT:    v_nop_e64
 ; GCN-NEXT:    v_nop_e64
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_mov_b64 vcc, exec
 ; GCN-NEXT:    s_cbranch_execnz .LBB5_5
 ; GCN-NEXT:  ; %bb.9: ; %bb3
 ; GCN-NEXT:    s_getpc_b64 s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll
index 3af1341e12c51..bcf9e77a15877 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_break.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn -disable-block-placement < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -disable-block-placement < %s | FileCheck -check-prefix=GCN %s
 
 ; Uses llvm.amdgcn.break
 
@@ -35,11 +35,11 @@ define amdgpu_kernel void @break_loop(i32 %arg) #0 {
 ; GCN-NEXT:    s_load_dword s3, s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b64 s[0:1], 0
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GCN-NEXT:    ; implicit-def: $sgpr6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GCN-NEXT:    ; implicit-def: $sgpr6
 ; GCN-NEXT:  .LBB0_1: ; %bb1
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    s_add_i32 s6, s6, 1
@@ -50,8 +50,8 @@ define amdgpu_kernel void @break_loop(i32 %arg) #0 {
 ; GCN-NEXT:    ; in Loop: Header=BB0_1 Depth=1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
 ; GCN-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
 ; GCN-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GCN-NEXT:  .LBB0_3: ; %Flow
@@ -115,10 +115,10 @@ define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 {
 ; GCN-NEXT:    s_load_dword s3, s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b64 s[0:1], 0
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    ; implicit-def: $sgpr6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    ; implicit-def: $sgpr6
 ; GCN-NEXT:  .LBB1_1: ; %bb1
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    s_cmp_gt_i32 s6, -1
@@ -205,11 +205,11 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 {
 ; GCN-NEXT:    s_load_dword s3, s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b64 s[0:1], 0
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GCN-NEXT:    ; implicit-def: $sgpr6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GCN-NEXT:    ; implicit-def: $sgpr6
 ; GCN-NEXT:  .LBB2_1: ; %bb1
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], exec
@@ -219,8 +219,8 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 {
 ; GCN-NEXT:    ; in Loop: Header=BB2_1 Depth=1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
 ; GCN-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
 ; GCN-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GCN-NEXT:  .LBB2_3: ; %Flow
@@ -296,11 +296,11 @@ define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 {
 ; GCN-NEXT:    s_load_dword s3, s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b64 s[0:1], 0
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GCN-NEXT:    ; implicit-def: $sgpr6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GCN-NEXT:    ; implicit-def: $sgpr6
 ; GCN-NEXT:  .LBB3_1: ; %bb1
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], exec
@@ -310,8 +310,8 @@ define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 {
 ; GCN-NEXT:    ; in Loop: Header=BB3_1 Depth=1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
 ; GCN-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
 ; GCN-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GCN-NEXT:  .LBB3_3: ; %Flow
@@ -386,11 +386,11 @@ define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 {
 ; GCN-NEXT:    s_load_dword s3, s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b64 s[0:1], 0
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GCN-NEXT:    ; implicit-def: $sgpr6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GCN-NEXT:    ; implicit-def: $sgpr6
 ; GCN-NEXT:  .LBB4_1: ; %bb1
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
@@ -400,8 +400,8 @@ define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 {
 ; GCN-NEXT:    ; in Loop: Header=BB4_1 Depth=1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
 ; GCN-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
 ; GCN-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GCN-NEXT:  .LBB4_3: ; %Flow
@@ -480,11 +480,11 @@ define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 {
 ; GCN-NEXT:    s_load_dword s3, s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b64 s[0:1], 0
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GCN-NEXT:    ; implicit-def: $sgpr6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GCN-NEXT:    ; implicit-def: $sgpr6
 ; GCN-NEXT:  .LBB5_1: ; %bb1
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], exec
@@ -494,8 +494,8 @@ define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 {
 ; GCN-NEXT:    ; in Loop: Header=BB5_1 Depth=1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
 ; GCN-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
 ; GCN-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GCN-NEXT:  .LBB5_3: ; %Flow
diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir
index 02e3d7e81fd40..54b969bcf252d 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir
+++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -start-before=livevars -stop-after=twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -start-before=livevars -stop-after=twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck %s
 
 # FIXME: update_mir_test_checks tries to incorrectly re-use a variable
 # name used for a copy, so some of the check variable names were
diff --git a/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir b/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir
index 9465cc0320906..2cd418f5d7d79 100644
--- a/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir
+++ b/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -mtriple=amdgcn -run-pass=early-machinelicm -simplify-mir -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -passes=early-machinelicm -simplify-mir -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=early-machinelicm -simplify-mir -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -passes=early-machinelicm -simplify-mir -o - %s | FileCheck %s
 
 # Test to check machine LICM does not hoist convergent instructions,
 # DS_PERMUTE_B32 in this example.
diff --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
index 05bd3ac93d608..3aa13e486da68 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefixes=EG
 ; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s --check-prefixes=CM
-; RUN: llc < %s -mtriple=amdgcn | FileCheck %s --check-prefixes=GCN
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx600 | FileCheck %s --check-prefixes=GCN
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefixes=GFX8,SI
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global | FileCheck %s --check-prefixes=GFX8,VI
 
@@ -46,12 +46,12 @@ define amdgpu_kernel void @u32_mad24(ptr addrspace(1) %out, i32 %a, i32 %b, i32
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s0, s0, 0xffffff
 ; GCN-NEXT:    s_and_b32 s1, s1, 0xffffff
 ; GCN-NEXT:    s_mul_i32 s0, s0, s1
 ; GCN-NEXT:    s_add_i32 s0, s0, s2
-; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -135,11 +135,11 @@ define amdgpu_kernel void @i16_mad24(ptr addrspace(1) %out, i16 %a, i16 %b, i16
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    s_lshr_b32 s6, s4, 16
+; GCN-NEXT:    s_mul_i32 s4, s4, s6
+; GCN-NEXT:    s_add_i32 s4, s4, s5
+; GCN-NEXT:    s_sext_i32_i16 s4, s4
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_lshr_b32 s2, s4, 16
-; GCN-NEXT:    s_mul_i32 s2, s4, s2
-; GCN-NEXT:    s_add_i32 s2, s2, s5
-; GCN-NEXT:    s_sext_i32_i16 s4, s2
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -218,16 +218,16 @@ define amdgpu_kernel void @i8_mad24(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c)
 ;
 ; GCN-LABEL: i8_mad24:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s2, s[4:5], 0xb
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s4, s2, 8
-; GCN-NEXT:    s_lshr_b32 s5, s2, 16
-; GCN-NEXT:    s_mul_i32 s2, s2, s4
-; GCN-NEXT:    s_add_i32 s2, s2, s5
-; GCN-NEXT:    s_sext_i32_i8 s4, s2
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s4, s6, 8
+; GCN-NEXT:    s_lshr_b32 s5, s6, 16
+; GCN-NEXT:    s_mul_i32 s4, s6, s4
+; GCN-NEXT:    s_add_i32 s4, s4, s5
+; GCN-NEXT:    s_sext_i32_i8 s4, s4
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -300,17 +300,17 @@ define amdgpu_kernel void @i24_i32_i32_mad(ptr addrspace(1) %out, i32 %a, i32 %b
 ;
 ; GCN-LABEL: i24_i32_i32_mad:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s2, s[4:5], 0xb
+; GCN-NEXT:    s_load_dword s8, s[4:5], 0xb
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s2, s2, 8
+; GCN-NEXT:    s_ashr_i32 s4, s8, 8
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_cselect_b32 s2, s2, 34
-; GCN-NEXT:    s_mul_i32 s2, s2, s6
-; GCN-NEXT:    s_add_i32 s4, s2, s7
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_cselect_b32 s4, s4, 34
+; GCN-NEXT:    s_mul_i32 s4, s4, s6
+; GCN-NEXT:    s_add_i32 s4, s4, s7
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -765,32 +765,33 @@ define amdgpu_kernel void @i8_mad_sat_16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN-LABEL: i8_mad_sat_16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GCN-NEXT:    s_load_dword s0, s[4:5], 0x11
 ; GCN-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
 ; GCN-NEXT:    s_mov_b32 s22, -1
 ; GCN-NEXT:    s_mov_b32 s23, 0xe8f000
 ; GCN-NEXT:    s_add_u32 s20, s20, s11
 ; GCN-NEXT:    s_addc_u32 s21, s21, 0
-; GCN-NEXT:    s_load_dword s8, s[4:5], 0x11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_add_i32 s9, s8, 4
-; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
-; GCN-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    s_add_i32 s1, s0, 4
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    buffer_load_dword v1, v1, s[20:23], 0 offen
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[20:23], 0 offen
+; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b32 s11, 0xf000
 ; GCN-NEXT:    s_mov_b32 s10, 0
 ; GCN-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GCN-NEXT:    s_mov_b64 s[18:19], s[10:11]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GCN-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GCN-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; GCN-NEXT:    s_movk_i32 s2, 0xff80
+; GCN-NEXT:    v_mov_b32_e32 v5, 0x7f
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_load_sbyte v2, v[0:1], s[12:15], 0 addr64
 ; GCN-NEXT:    buffer_load_sbyte v3, v[0:1], s[8:11], 0 addr64
 ; GCN-NEXT:    buffer_load_sbyte v4, v[0:1], s[16:19], 0 addr64
-; GCN-NEXT:    s_movk_i32 s2, 0xff80
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
@@ -798,8 +799,7 @@ define amdgpu_kernel void @i8_mad_sat_16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mad_u32_u24 v2, v2, v3, v4
 ; GCN-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; GCN-NEXT:    v_mov_b32_e32 v3, 0x7f
-; GCN-NEXT:    v_med3_i32 v2, v2, s2, v3
+; GCN-NEXT:    v_med3_i32 v2, v2, s2, v5
 ; GCN-NEXT:    s_mov_b64 s[2:3], s[10:11]
 ; GCN-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64
 ; GCN-NEXT:    s_endpgm
@@ -987,35 +987,35 @@ define amdgpu_kernel void @i8_mad_32(ptr addrspace(1) %out, ptr addrspace(1) %a,
 ; GCN-LABEL: i8_mad_32:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
+; GCN-NEXT:    s_load_dword s0, s[4:5], 0x11
 ; GCN-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
 ; GCN-NEXT:    s_mov_b32 s26, -1
 ; GCN-NEXT:    s_mov_b32 s27, 0xe8f000
 ; GCN-NEXT:    s_add_u32 s24, s24, s11
 ; GCN-NEXT:    s_addc_u32 s25, s25, 0
-; GCN-NEXT:    s_load_dword s8, s[4:5], 0x11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_add_i32 s9, s8, 4
-; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
-; GCN-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    s_add_i32 s1, s0, 4
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    buffer_load_dword v1, v1, s[24:27], 0 offen
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[24:27], 0 offen
+; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b32 s11, 0xf000
 ; GCN-NEXT:    s_mov_b32 s14, 0
 ; GCN-NEXT:    s_mov_b32 s15, s11
 ; GCN-NEXT:    s_mov_b64 s[18:19], s[14:15]
-; GCN-NEXT:    s_mov_b64 s[22:23], s[14:15]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GCN-NEXT:    s_mov_b64 s[22:23], s[14:15]
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[4:5]
 ; GCN-NEXT:    s_mov_b64 s[20:21], s[6:7]
+; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    s_mov_b32 s8, s0
+; GCN-NEXT:    s_mov_b32 s9, s1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_load_sbyte v2, v[0:1], s[12:15], 0 addr64
 ; GCN-NEXT:    buffer_load_sbyte v3, v[0:1], s[16:19], 0 addr64
 ; GCN-NEXT:    buffer_load_sbyte v0, v[0:1], s[20:23], 0 addr64
-; GCN-NEXT:    s_mov_b32 s10, -1
-; GCN-NEXT:    s_mov_b32 s8, s0
-; GCN-NEXT:    s_mov_b32 s9, s1
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
@@ -1203,35 +1203,35 @@ define amdgpu_kernel void @i8_mad_64(ptr addrspace(1) %out, ptr addrspace(1) %a,
 ; GCN-LABEL: i8_mad_64:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
+; GCN-NEXT:    s_load_dword s0, s[4:5], 0x11
 ; GCN-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
 ; GCN-NEXT:    s_mov_b32 s26, -1
 ; GCN-NEXT:    s_mov_b32 s27, 0xe8f000
 ; GCN-NEXT:    s_add_u32 s24, s24, s11
 ; GCN-NEXT:    s_addc_u32 s25, s25, 0
-; GCN-NEXT:    s_load_dword s8, s[4:5], 0x11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_add_i32 s9, s8, 4
-; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
-; GCN-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    s_add_i32 s1, s0, 4
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    buffer_load_dword v1, v1, s[24:27], 0 offen
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[24:27], 0 offen
+; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b32 s11, 0xf000
 ; GCN-NEXT:    s_mov_b32 s14, 0
 ; GCN-NEXT:    s_mov_b32 s15, s11
 ; GCN-NEXT:    s_mov_b64 s[18:19], s[14:15]
-; GCN-NEXT:    s_mov_b64 s[22:23], s[14:15]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GCN-NEXT:    s_mov_b64 s[22:23], s[14:15]
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[4:5]
 ; GCN-NEXT:    s_mov_b64 s[20:21], s[6:7]
+; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    s_mov_b32 s8, s0
+; GCN-NEXT:    s_mov_b32 s9, s1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_load_sbyte v2, v[0:1], s[12:15], 0 addr64
 ; GCN-NEXT:    buffer_load_sbyte v3, v[0:1], s[16:19], 0 addr64
 ; GCN-NEXT:    buffer_load_sbyte v0, v[0:1], s[20:23], 0 addr64
-; GCN-NEXT:    s_mov_b32 s10, -1
-; GCN-NEXT:    s_mov_b32 s8, s0
-; GCN-NEXT:    s_mov_b32 s9, s1
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
@@ -1512,31 +1512,31 @@ define void @mad24_known_bits_destroyed(i32 %arg, <4 x i32> %arg1, <4 x i32> %ar
 ; GCN-LABEL: mad24_known_bits_destroyed:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    v_mov_b32_e32 v5, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v13
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xffffff, v4
 ; GCN-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
 ; GCN-NEXT:  .LBB9_1: ; %bb19
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_add_i32_e32 v15, vcc, -1, v15
 ; GCN-NEXT:    v_mad_u32_u24 v4, v5, v0, v14
 ; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    v_mad_u32_u24 v6, v6, v1, v10
 ; GCN-NEXT:    v_mad_u32_u24 v7, v7, v2, v11
 ; GCN-NEXT:    v_mad_u32_u24 v8, v8, v3, v12
-; GCN-NEXT:    v_add_i32_e32 v15, vcc, -1, v15
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v15
 ; GCN-NEXT:    v_mad_u32_u24 v5, v4, v0, v14
 ; GCN-NEXT:    v_mad_u32_u24 v6, v6, v1, v10
 ; GCN-NEXT:    v_mad_u32_u24 v7, v7, v2, v11
 ; GCN-NEXT:    v_mad_u32_u24 v8, v8, v3, v12
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v15
-; GCN-NEXT:    buffer_store_dword v5, v[16:17], s[4:7], 0 addr64
 ; GCN-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-NEXT:    buffer_store_dword v5, v[16:17], s[4:7], 0 addr64
 ; GCN-NEXT:    buffer_store_dwordx4 v[5:8], v[18:19], s[4:7], 0 addr64
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_execnz .LBB9_1
diff --git a/llvm/test/CodeGen/AMDGPU/mcp-use-before-def.mir b/llvm/test/CodeGen/AMDGPU/mcp-use-before-def.mir
index 8ca35d1dd53a3..d65a3b888a675 100644
--- a/llvm/test/CodeGen/AMDGPU/mcp-use-before-def.mir
+++ b/llvm/test/CodeGen/AMDGPU/mcp-use-before-def.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs  -run-pass=machine-cp -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs  -run-pass=machine-cp -o - %s | FileCheck %s
 
 # machine copy prop should not introduce use before def
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
index 5633884baccf6..5afaa7065f87f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn -run-pass=si-memory-legalizer  %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=si-memory-legalizer  %s -o - | FileCheck %s
 
 --- |
   declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll
index a476a5830ffad..9b9aa709a9c07 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GCN %s
 
 ; Effectively, check that the compile finishes; in the case
 ; of an infinite loop, llc toggles between merging 2 ST4s
@@ -9,23 +9,23 @@
 define amdgpu_kernel void @_Z6brokenPd(ptr %arg) {
 ; GCN-LABEL: _Z6brokenPd:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GCN-NEXT:    s_add_u32 s0, s0, s17
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    s_add_u32 s4, s4, 4
-; GCN-NEXT:    s_addc_u32 s5, s5, 0
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
+; GCN-NEXT:    s_addc_u32 s5, s5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x7ff80000
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    flat_store_dword v[0:1], v3
+; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 bb:
   %tmp = alloca double, align 8, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 10d4eb029ee35..c89279d3bf81c 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefixes=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
@@ -12,10 +12,10 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    s_bfe_i32 s4, s4, 0x180000
+; SI-NEXT:    s_bfe_i32 s5, s5, 0x180000
+; SI-NEXT:    s_mul_i32 s4, s4, s5
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_bfe_i32 s2, s4, 0x180000
-; SI-NEXT:    s_bfe_i32 s4, s5, 0x180000
-; SI-NEXT:    s_mul_i32 s4, s2, s4
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -101,9 +101,9 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-NEXT:    v_mul_hi_i32_i24_e32 v0, s2, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -270,17 +270,17 @@ define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
 ; SI-LABEL: test_smul24_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
-; SI-NEXT:    s_load_dword s4, s[4:5], 0x1c
+; SI-NEXT:    s_load_dword s7, s[4:5], 0x1c
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_i32 s5, s6, 0x180000
-; SI-NEXT:    s_bfe_i32 s4, s4, 0x180000
-; SI-NEXT:    v_mov_b32_e32 v0, s5
-; SI-NEXT:    s_mul_i32 s5, s4, s5
-; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, s4, v0
+; SI-NEXT:    s_bfe_i32 s4, s6, 0x180000
+; SI-NEXT:    s_bfe_i32 s5, s7, 0x180000
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, s5, v0
+; SI-NEXT:    s_mul_i32 s5, s5, s4
 ; SI-NEXT:    v_mov_b32_e32 v0, s5
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -378,9 +378,9 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a,
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_bfe_i32 s4, s6, 0x180000
-; SI-NEXT:    s_mul_i32 s5, s4, s4
 ; SI-NEXT:    v_mul_hi_i32_i24_e64 v1, s4, s4
-; SI-NEXT:    v_mov_b32_e32 v0, s5
+; SI-NEXT:    s_mul_i32 s4, s4, s4
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -462,17 +462,16 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_bfe_i32 s4, s4, 0x180000
-; SI-NEXT:    s_bfe_i32 s5, s6, 0x180000
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    s_mul_i32 s4, s5, s4
-; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, s5, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    s_bfe_i32 s2, s2, 0x180000
+; SI-NEXT:    s_bfe_i32 s3, s4, 0x180000
+; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, s2, v0
+; SI-NEXT:    s_mul_i32 s2, s2, s3
+; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 31
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], 31
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -573,10 +572,10 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mul_hi_i32_i24_e32 v0, s6, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -669,11 +668,11 @@ define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0,
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_bfe_i32 s0, s0, 0x180000
 ; SI-NEXT:    s_bfe_i32 s1, s2, 0x180000
 ; SI-NEXT:    s_mul_i32 s0, s0, s1
-; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -800,15 +799,15 @@ bb7:
 define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) {
 ; SI-LABEL: test_umul_i24:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
 ; SI-NEXT:    v_mov_b32_e32 v0, 0xff803fe1
-; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshr_b32 s2, s2, 9
-; SI-NEXT:    s_mul_i32 s4, s2, 0xff803fe1
+; SI-NEXT:    s_lshr_b32 s2, s0, 9
 ; SI-NEXT:    v_mul_hi_u32 v1, s2, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    s_mul_i32 s2, s2, 0xff803fe1
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    v_lshr_b64 v[0:1], v[0:1], 1
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 1165401a93af8..d7d78b8450053 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
@@ -12,10 +12,10 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    s_and_b32 s4, s4, 0xffffff
+; SI-NEXT:    s_and_b32 s5, s5, 0xffffff
+; SI-NEXT:    s_mul_i32 s4, s4, s5
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_and_b32 s2, s4, 0xffffff
-; SI-NEXT:    s_and_b32 s4, s5, 0xffffff
-; SI-NEXT:    s_mul_i32 s4, s2, s4
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -61,14 +61,14 @@ entry:
 define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i16 %b) {
 ; SI-LABEL: test_umul24_i16_sext:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshr_b32 s4, s2, 16
-; SI-NEXT:    s_mul_i32 s2, s2, s4
-; SI-NEXT:    s_sext_i32_i16 s4, s2
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s4, s6, 16
+; SI-NEXT:    s_mul_i32 s6, s6, s4
+; SI-NEXT:    s_sext_i32_i16 s4, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -113,13 +113,13 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_mov_b32 s11, s7
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v3, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
-; SI-NEXT:    s_mov_b32 s11, s7
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
 ; SI-NEXT:    s_mov_b32 s6, -1
@@ -183,14 +183,14 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr
 define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b) {
 ; SI-LABEL: test_umul24_i16:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshr_b32 s4, s2, 16
-; SI-NEXT:    s_mul_i32 s2, s2, s4
-; SI-NEXT:    s_and_b32 s4, s2, 0xffff
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s4, s6, 16
+; SI-NEXT:    s_mul_i32 s6, s6, s4
+; SI-NEXT:    s_and_b32 s4, s6, 0xffff
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -235,13 +235,13 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_mov_b32 s11, s7
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v3, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
-; SI-NEXT:    s_mov_b32 s11, s7
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
 ; SI-NEXT:    s_mov_b32 s6, -1
@@ -303,17 +303,17 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs
 define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
 ; SI-LABEL: test_umul24_i8_vgpr:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    v_mov_b32_e32 v3, v0
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    v_mov_b32_e32 v3, v0
 ; SI-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-NEXT:    s_mov_b32 s14, 0
 ; SI-NEXT:    s_mov_b32 s15, s11
-; SI-NEXT:    v_mov_b32_e32 v2, v4
-; SI-NEXT:    s_mov_b64 s[6:7], s[14:15]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    s_mov_b64 s[6:7], s[14:15]
 ; SI-NEXT:    buffer_load_ubyte v0, v[3:4], s[12:15], 0 addr64
 ; SI-NEXT:    buffer_load_ubyte v1, v[1:2], s[4:7], 0 addr64
 ; SI-NEXT:    s_mov_b32 s10, -1
@@ -380,9 +380,9 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a,
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-NEXT:    v_mul_hi_u32_u24_e32 v0, s2, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -434,9 +434,9 @@ define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    v_mul_hi_u32_u24_e32 v0, s2, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -496,8 +496,8 @@ define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b
 ; SI-NEXT:    s_and_b32 s0, s2, 0xffffff
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_and_b32 s1, s3, 0xffffff
-; SI-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-NEXT:    s_mul_i32 s0, s0, s1
+; SI-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-NEXT:    v_mul_hi_u32_u24_e32 v1, s2, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -630,11 +630,11 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_mul_i32 s4, s4, s5
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_and_b32 s2, s4, 0xffff
-; SI-NEXT:    s_and_b32 s4, s5, 0xffff
-; SI-NEXT:    s_mul_i32 s2, s2, s4
-; SI-NEXT:    s_lshr_b32 s4, s2, 16
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -679,19 +679,19 @@ entry:
 define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) {
 ; SI-LABEL: test_umul24_i33:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
-; SI-NEXT:    s_load_dword s4, s[4:5], 0xd
+; SI-NEXT:    s_load_dword s7, s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s5, s6, 0xffffff
-; SI-NEXT:    s_and_b32 s7, s4, 0xffffff
-; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    s_and_b32 s4, s6, 0xffffff
+; SI-NEXT:    s_and_b32 s5, s7, 0xffffff
+; SI-NEXT:    v_mov_b32_e32 v0, s7
+; SI-NEXT:    s_mul_i32 s4, s4, s5
 ; SI-NEXT:    v_mul_hi_u32_u24_e32 v0, s6, v0
-; SI-NEXT:    s_mul_i32 s5, s5, s7
 ; SI-NEXT:    v_and_b32_e32 v1, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s5
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index 1fad8f37cc28c..da012e6ff907d 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -mtriple=amdgcn-- -lowerswitch -structurizecfg -si-annotate-control-flow < %s | FileCheck -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GCN %s
 
 ; Ensure two if.break calls, for both the inner and outer loops
 ; FIXME: duplicate comparison
@@ -58,10 +58,10 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
 ; GCN-NEXT:  .LBB0_2: ; %LOOP.outer
 ; GCN-NEXT:    ; =>This Loop Header: Depth=1
 ; GCN-NEXT:    ; Child Loop BB0_4 Depth 2
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:    ; implicit-def: $sgpr2_sgpr3
 ; GCN-NEXT:    ; implicit-def: $sgpr8_sgpr9
 ; GCN-NEXT:    ; implicit-def: $sgpr6_sgpr7
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:    s_branch .LBB0_4
 ; GCN-NEXT:  .LBB0_3: ; %Flow
 ; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
@@ -84,10 +84,10 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
 ; GCN-NEXT:  ; %bb.5: ; %ENDIF
 ; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v0
 ; GCN-NEXT:    s_andn2_b64 s[8:9], s[8:9], exec
 ; GCN-NEXT:    s_and_b64 s[12:13], vcc, exec
+; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
 ; GCN-NEXT:    s_branch .LBB0_3
 ; GCN-NEXT:  .LBB0_6: ; %IF
@@ -182,10 +182,10 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s8, v1
 ; GCN-NEXT:    s_mov_b64 s[4:5], -1
-; GCN-NEXT:    s_cmp_lt_i32 s8, 1
 ; GCN-NEXT:    s_mov_b64 s[6:7], -1
+; GCN-NEXT:    v_readfirstlane_b32 s8, v1
+; GCN-NEXT:    s_cmp_lt_i32 s8, 1
 ; GCN-NEXT:    s_cbranch_scc1 .LBB1_6
 ; GCN-NEXT:  ; %bb.3: ; %LeafBlock1
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
@@ -212,8 +212,8 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
 ; GCN-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
 ; GCN-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-NEXT:    s_branch .LBB1_1
diff --git a/llvm/test/CodeGen/AMDGPU/no-limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/no-limit-coalesce.mir
index 92249eca38d20..2e683b53818c3 100644
--- a/llvm/test/CodeGen/AMDGPU/no-limit-coalesce.mir
+++ b/llvm/test/CodeGen/AMDGPU/no-limit-coalesce.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
-# RUN: llc -mtriple=amdgcn -run-pass register-coalescer -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass register-coalescer -o - %s | FileCheck %s
 
 # Check that coalescer may create wider register tuple than in source.
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
index 48ac1c60550d7..7626732ecb4bf 100644
--- a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass si-fix-sgpr-copies,si-fold-operands,dead-mi-elimination -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass si-fix-sgpr-copies,si-fold-operands,dead-mi-elimination -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
 # Check that constant is in SGPR registers
 
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir b/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir
index 6be3e592eee45..c5e54eac6aed8 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -run-pass=si-optimize-exec-masking-pre-ra -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 -run-pass=si-optimize-exec-masking-pre-ra -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
 
 # Check for regression from assuming an instruction was a copy after
 # dropping the opcode check.
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index 0887f41b7db97..a8b2af48e5c98 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GCN %s
 
 define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 ; GCN-LABEL: negated_cond:
@@ -7,8 +7,8 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s10, -1
-; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s11, s7
+; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s8, s4
 ; GCN-NEXT:    s_mov_b32 s9, s5
@@ -22,12 +22,12 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 ; GCN-NEXT:    ; =>This Loop Header: Depth=1
 ; GCN-NEXT:    ; Child Loop BB0_4 Depth 2
 ; GCN-NEXT:    buffer_load_dword v1, off, s[8:11], 0
+; GCN-NEXT:    s_mov_b32 s12, s6
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v1
-; GCN-NEXT:    s_mov_b32 s12, s6
 ; GCN-NEXT:    s_branch .LBB0_4
 ; GCN-NEXT:  .LBB0_3: ; %Flow1
 ; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
@@ -55,13 +55,13 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 ; GCN-NEXT:  ; %bb.8: ; %bb4
 ; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
 ; GCN-NEXT:    s_ashr_i32 s13, s12, 31
-; GCN-NEXT:    s_lshl_b64 s[16:17], s[12:13], 2
-; GCN-NEXT:    s_mov_b64 s[14:15], 0
-; GCN-NEXT:    v_mov_b32_e32 v1, s16
-; GCN-NEXT:    v_mov_b32_e32 v2, s17
-; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_lshl_b64 s[14:15], s[12:13], 2
+; GCN-NEXT:    v_mov_b32_e32 v1, s14
 ; GCN-NEXT:    s_cmp_eq_u32 s12, 32
+; GCN-NEXT:    v_mov_b32_e32 v2, s15
+; GCN-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-NEXT:    s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
 ; GCN-NEXT:    s_branch .LBB0_3
 ; GCN-NEXT:  .LBB0_9: ; %DummyReturnBlock
 ; GCN-NEXT:    s_endpgm
@@ -94,16 +94,16 @@ define amdgpu_kernel void @negated_cond_dominated_blocks(ptr addrspace(1) %arg1)
 ; GCN-LABEL: negated_cond_dominated_blocks:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_mov_b32 s3, s6
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GCN-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_mov_b32 s3, s6
 ; GCN-NEXT:    s_branch .LBB1_2
 ; GCN-NEXT:  .LBB1_1: ; %bb7
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
@@ -112,8 +112,8 @@ define amdgpu_kernel void @negated_cond_dominated_blocks(ptr addrspace(1) %arg1)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s8
 ; GCN-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NEXT:    s_cmp_eq_u32 s2, 32
-; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
 ; GCN-NEXT:    s_mov_b32 s3, s2
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
 ; GCN-NEXT:    s_cbranch_scc1 .LBB1_6
 ; GCN-NEXT:  .LBB1_2: ; %bb4
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -122,12 +122,10 @@ define amdgpu_kernel void @negated_cond_dominated_blocks(ptr addrspace(1) %arg1)
 ; GCN-NEXT:  ; %bb.3: ; %bb6
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_add_i32 s2, s3, 1
-; GCN-NEXT:    s_mov_b64 vcc, exec
 ; GCN-NEXT:    s_cbranch_execnz .LBB1_1
 ; GCN-NEXT:    s_branch .LBB1_5
 ; GCN-NEXT:  .LBB1_4: ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    ; implicit-def: $sgpr2
-; GCN-NEXT:    s_mov_b64 vcc, 0
 ; GCN-NEXT:  .LBB1_5: ; %bb5
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_lshl_b32 s2, s3, 5
diff --git a/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir b/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir
index c9645c31aad75..2eed55c01a90e 100644
--- a/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir
+++ b/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass peephole-opt -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass peephole-opt -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
 # Check that when we jump through several subregisters in sequence of
 # reg_sequence we can still find a plain src for a copy.
diff --git a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir
index bb86ab1b3adea..adec09e663f43 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck %s
 
 # The wrong form of scavengeRegister was used, so it wasn't accounting
 # for the iterator passed to eliminateFrameIndex. It was instead using
diff --git a/llvm/test/CodeGen/AMDGPU/perfhint.ll b/llvm/test/CodeGen/AMDGPU/perfhint.ll
index 3ff9e6a3b3da4..2402ca0ef402e 100644
--- a/llvm/test/CodeGen/AMDGPU/perfhint.ll
+++ b/llvm/test/CodeGen/AMDGPU/perfhint.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-perf-hint < %s | FileCheck -check-prefix=CHECK %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-perf-hint < %s | FileCheck -check-prefix=CHECK %s
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}test_membound:
 ; GCN: MemoryBound: 1
diff --git a/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll b/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll
index e941186541642..67c796029cbf3 100644
--- a/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefix=GCN %s
 
 define amdgpu_kernel void @rcp_uint(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: rcp_uint:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GCN-NEXT:    s_mov_b32 s4, s2
 ; GCN-NEXT:    s_mov_b32 s5, s3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -27,10 +27,10 @@ define amdgpu_kernel void @rcp_sint(ptr addrspace(1) %in, ptr addrspace(1) %out)
 ; GCN-LABEL: rcp_sint:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GCN-NEXT:    s_mov_b32 s4, s2
 ; GCN-NEXT:    s_mov_b32 s5, s3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -49,19 +49,19 @@ define amdgpu_kernel void @rcp_uint_denorm(ptr addrspace(1) %in, ptr addrspace(1
 ; GCN-LABEL: rcp_uint_denorm:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GCN-NEXT:    s_mov_b32 s4, s2
 ; GCN-NEXT:    s_mov_b32 s5, s3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s0
 ; GCN-NEXT:    v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v1
-; GCN-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GCN-NEXT:    v_fma_f32 v2, v3, v2, v2
 ; GCN-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GCN-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; GCN-NEXT:    v_fma_f32 v2, v4, v2, v2
 ; GCN-NEXT:    v_mul_f32_e32 v4, v3, v2
 ; GCN-NEXT:    v_fma_f32 v5, -v1, v4, v3
 ; GCN-NEXT:    v_fma_f32 v4, v5, v2, v4
@@ -81,19 +81,19 @@ define amdgpu_kernel void @rcp_sint_denorm(ptr addrspace(1) %in, ptr addrspace(1
 ; GCN-LABEL: rcp_sint_denorm:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GCN-NEXT:    s_mov_b32 s4, s2
 ; GCN-NEXT:    s_mov_b32 s5, s3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s0
 ; GCN-NEXT:    v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v1
-; GCN-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GCN-NEXT:    v_fma_f32 v2, v3, v2, v2
 ; GCN-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GCN-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; GCN-NEXT:    v_fma_f32 v2, v4, v2, v2
 ; GCN-NEXT:    v_mul_f32_e32 v4, v3, v2
 ; GCN-NEXT:    v_fma_f32 v5, -v1, v4, v3
 ; GCN-NEXT:    v_fma_f32 v4, v5, v2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/regcoalesce-cannot-join-failures.mir b/llvm/test/CodeGen/AMDGPU/regcoalesce-cannot-join-failures.mir
index 6c556433088c5..f36720beca4a7 100644
--- a/llvm/test/CodeGen/AMDGPU/regcoalesce-cannot-join-failures.mir
+++ b/llvm/test/CodeGen/AMDGPU/regcoalesce-cannot-join-failures.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-coalescing -run-pass=register-coalescer -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-coalescing -run-pass=register-coalescer -verify-machineinstrs -o - %s | FileCheck %s
 
 ---
 name: couldnt_join_subrange_implicit_def_pred_block
diff --git a/llvm/test/CodeGen/AMDGPU/regcoalesce-keep-valid-lanes-implicit-def-bug39602.mir b/llvm/test/CodeGen/AMDGPU/regcoalesce-keep-valid-lanes-implicit-def-bug39602.mir
index 18eb5586fdecf..4be94f7599948 100644
--- a/llvm/test/CodeGen/AMDGPU/regcoalesce-keep-valid-lanes-implicit-def-bug39602.mir
+++ b/llvm/test/CodeGen/AMDGPU/regcoalesce-keep-valid-lanes-implicit-def-bug39602.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-coalescing -run-pass=register-coalescer -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-coalescing -run-pass=register-coalescer -verify-machineinstrs -o - %s | FileCheck %s
 
 # Bug 39602: Avoid "Couldn't join subrange" error when clearing valid
 # lanes on an implicit_def that later cannot be erased.
diff --git a/llvm/test/CodeGen/AMDGPU/regcoalescer-resolve-lane-conflict-by-subranges.mir b/llvm/test/CodeGen/AMDGPU/regcoalescer-resolve-lane-conflict-by-subranges.mir
index d0245ff1a73ae..b77aae761b2fb 100644
--- a/llvm/test/CodeGen/AMDGPU/regcoalescer-resolve-lane-conflict-by-subranges.mir
+++ b/llvm/test/CodeGen/AMDGPU/regcoalescer-resolve-lane-conflict-by-subranges.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass register-coalescer -verify-machineinstrs -o - %s | FileCheck --check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass register-coalescer -verify-machineinstrs -o - %s | FileCheck --check-prefix=GCN %s
 #
 
 
diff --git a/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir b/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir
index c7fa879187cc3..287f61b7321d7 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -start-before=greedy -stop-after=virtregrewriter -stress-regalloc=3 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -start-before=greedy -stop-after=virtregrewriter -stress-regalloc=3 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
 # Check that subreg use is live at the point of materialization, not just the main range.
 # Do not rematerialize if used subreg is dead at a new index.
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-dbg.mir b/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-dbg.mir
index ede043ce73a47..28a58b8201be5 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-dbg.mir
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-dbg.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-enable-rewrite-partial-reg-uses=true -verify-machineinstrs -start-before=rename-independent-subregs -stop-after=rewrite-partial-reg-uses %s -o - | FileCheck -check-prefix=CHECK %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -passes="rename-independent-subregs,amdgpu-rewrite-partial-reg-uses" %s -o - | FileCheck -check-prefix=CHECK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-enable-rewrite-partial-reg-uses=true -verify-machineinstrs -start-before=rename-independent-subregs -stop-after=rewrite-partial-reg-uses %s -o - | FileCheck -check-prefix=CHECK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -passes="rename-independent-subregs,amdgpu-rewrite-partial-reg-uses" %s -o - | FileCheck -check-prefix=CHECK %s
 --- |
   define void @test_vreg_96_w64() !dbg !5 {
   entry:
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-gen.mir b/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-gen.mir
index 79e9ce2737695..86877696b9458 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-gen.mir
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-gen.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-enable-rewrite-partial-reg-uses=true -verify-machineinstrs -start-before=rename-independent-subregs -stop-after=rewrite-partial-reg-uses %s -o - | FileCheck -check-prefix=CHECK %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -passes="rename-independent-subregs,amdgpu-rewrite-partial-reg-uses" %s -o - | FileCheck -check-prefix=CHECK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-enable-rewrite-partial-reg-uses=true -verify-machineinstrs -start-before=rename-independent-subregs -stop-after=rewrite-partial-reg-uses %s -o - | FileCheck -check-prefix=CHECK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -passes="rename-independent-subregs,amdgpu-rewrite-partial-reg-uses" %s -o - | FileCheck -check-prefix=CHECK %s
 ---
 name: test_subregs_composition_vreg_1024
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses.mir b/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses.mir
index 33007ee8a7c38..90b0430d82d66 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses.mir
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-enable-rewrite-partial-reg-uses=true -verify-machineinstrs -start-before=rename-independent-subregs -stop-after=rewrite-partial-reg-uses %s -o - | FileCheck -check-prefix=CHECK %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -passes="rename-independent-subregs,amdgpu-rewrite-partial-reg-uses" %s -o - | FileCheck -check-prefix=CHECK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-enable-rewrite-partial-reg-uses=true -verify-machineinstrs -start-before=rename-independent-subregs -stop-after=rewrite-partial-reg-uses %s -o - | FileCheck -check-prefix=CHECK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -passes="rename-independent-subregs,amdgpu-rewrite-partial-reg-uses" %s -o - | FileCheck -check-prefix=CHECK %s
 ---
 name: test_subregs_composition_vreg_1024
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/rotate-add.ll b/llvm/test/CodeGen/AMDGPU/rotate-add.ll
index a295b1a5acab4..dfb32f8db21ea 100644
--- a/llvm/test/CodeGen/AMDGPU/rotate-add.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotate-add.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=SI %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
 
 target triple = "nvptx64-nvidia-cuda"
@@ -184,23 +184,23 @@ define i64 @test_rotl_udiv_special_case(i64 %i) {
 ; SI-LABEL: test_rotl_udiv_special_case:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaaa
-; SI-NEXT:    s_mov_b32 s5, 0xaaaaaaab
-; SI-NEXT:    v_mul_hi_u32 v2, v0, s4
-; SI-NEXT:    v_mul_lo_u32 v3, v0, s4
-; SI-NEXT:    v_mul_hi_u32 v4, v1, s5
-; SI-NEXT:    v_mul_lo_u32 v5, v1, s5
-; SI-NEXT:    v_mul_hi_u32 v0, v0, s5
-; SI-NEXT:    v_mul_hi_u32 v6, v1, s4
-; SI-NEXT:    v_mul_lo_u32 v1, v1, s4
-; SI-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
-; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
+; SI-NEXT:    v_mul_lo_u32 v3, v1, s4
+; SI-NEXT:    v_mul_hi_u32 v4, v0, s4
+; SI-NEXT:    s_mov_b32 s6, 0xaaaaaaaa
+; SI-NEXT:    v_mul_hi_u32 v5, v1, s4
+; SI-NEXT:    v_mul_hi_u32 v2, v0, s6
+; SI-NEXT:    v_mul_lo_u32 v0, v0, s6
+; SI-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; SI-NEXT:    v_mul_lo_u32 v2, v1, s6
+; SI-NEXT:    v_mul_hi_u32 v1, v1, s6
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
 ; SI-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
-; SI-NEXT:    v_add_i32_e32 v2, vcc, v1, v0
-; SI-NEXT:    v_addc_u32_e32 v3, vcc, v6, v3, vcc
+; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v0
+; SI-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
 ; SI-NEXT:    v_lshr_b64 v[0:1], v[2:3], 5
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 27, v2
 ; SI-NEXT:    v_and_b32_e32 v0, 0xf0000000, v0
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index a7fcb6439703a..3d2c268416f30 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600 %s
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefixes=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
@@ -24,15 +24,15 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_sub_i32 s4, 32, s3
-; SI-NEXT:    s_mov_b32 s3, s2
-; SI-NEXT:    s_and_b32 s4, s4, 31
-; SI-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_sub_i32 s0, 32, s3
+; SI-NEXT:    s_and_b32 s0, s0, 31
+; SI-NEXT:    s_mov_b32 s3, s2
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s0
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -107,17 +107,17 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_sub_i32 s6, 32, s2
-; SI-NEXT:    s_sub_i32 s8, 32, s3
+; SI-NEXT:    s_sub_i32 s8, 32, s2
+; SI-NEXT:    s_sub_i32 s2, 32, s3
+; SI-NEXT:    s_and_b32 s9, s2, 31
 ; SI-NEXT:    s_mov_b32 s2, s1
 ; SI-NEXT:    s_mov_b32 s3, s1
+; SI-NEXT:    s_lshr_b64 s[2:3], s[2:3], s9
+; SI-NEXT:    s_and_b32 s3, s8, 31
 ; SI-NEXT:    s_mov_b32 s1, s0
-; SI-NEXT:    s_and_b32 s8, s8, 31
-; SI-NEXT:    s_and_b32 s6, s6, 31
-; SI-NEXT:    s_lshr_b64 s[2:3], s[2:3], s8
-; SI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s6
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s3
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -222,29 +222,29 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
 ; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_sub_i32 s2, 32, s12
-; SI-NEXT:    s_sub_i32 s12, 32, s13
-; SI-NEXT:    s_sub_i32 s13, 32, s14
-; SI-NEXT:    s_sub_i32 s14, 32, s15
+; SI-NEXT:    s_sub_i32 s4, 32, s15
+; SI-NEXT:    s_and_b32 s7, s4, 31
 ; SI-NEXT:    s_mov_b32 s4, s11
 ; SI-NEXT:    s_mov_b32 s5, s11
+; SI-NEXT:    s_sub_i32 s6, 32, s14
+; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
+; SI-NEXT:    s_sub_i32 s13, 32, s13
+; SI-NEXT:    s_and_b32 s5, s6, 31
 ; SI-NEXT:    s_mov_b32 s11, s10
-; SI-NEXT:    s_mov_b32 s6, s9
-; SI-NEXT:    s_mov_b32 s7, s9
+; SI-NEXT:    s_sub_i32 s12, 32, s12
+; SI-NEXT:    s_lshr_b64 s[6:7], s[10:11], s5
+; SI-NEXT:    s_and_b32 s5, s13, 31
+; SI-NEXT:    s_mov_b32 s10, s9
+; SI-NEXT:    s_mov_b32 s11, s9
+; SI-NEXT:    s_lshr_b64 s[10:11], s[10:11], s5
+; SI-NEXT:    s_and_b32 s5, s12, 31
 ; SI-NEXT:    s_mov_b32 s9, s8
-; SI-NEXT:    s_and_b32 s14, s14, 31
-; SI-NEXT:    s_and_b32 s13, s13, 31
-; SI-NEXT:    s_and_b32 s12, s12, 31
-; SI-NEXT:    s_and_b32 s2, s2, 31
-; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s14
-; SI-NEXT:    s_lshr_b64 s[10:11], s[10:11], s13
-; SI-NEXT:    s_lshr_b64 s[6:7], s[6:7], s12
-; SI-NEXT:    s_lshr_b64 s[8:9], s[8:9], s2
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_lshr_b64 s[8:9], s[8:9], s5
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v1, s10
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v3, s4
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -409,9 +409,9 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 15, v2
 ; SI-NEXT:    v_sub_i32_e32 v2, vcc, 0, v2
+; SI-NEXT:    v_and_b32_e32 v2, 15, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, v1, v0
-; SI-NEXT:    v_and_b32_e32 v2, 15, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_short v0, v[4:5], s[4:7], 0 addr64 offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index 581312dd3e73f..7df102fd9c856 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600 %s
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefixes=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
@@ -22,14 +22,14 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s3, 31
-; SI-NEXT:    s_mov_b32 s3, s2
-; SI-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_and_b32 s0, s3, 31
+; SI-NEXT:    s_mov_b32 s3, s2
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s0
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -96,17 +96,17 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_and_b32 s3, s3, 31
 ; SI-NEXT:    s_mov_b32 s8, s1
 ; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    s_and_b32 s6, s2, 31
+; SI-NEXT:    s_and_b32 s2, s2, 31
 ; SI-NEXT:    s_mov_b32 s1, s0
-; SI-NEXT:    s_lshr_b64 s[2:3], s[8:9], s3
-; SI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s6
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_lshr_b64 s[8:9], s[8:9], s3
+; SI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v1, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s8
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -195,25 +195,25 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
 ; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s2, s15, 31
+; SI-NEXT:    s_and_b32 s6, s15, 31
 ; SI-NEXT:    s_mov_b32 s4, s11
 ; SI-NEXT:    s_mov_b32 s5, s11
-; SI-NEXT:    s_and_b32 s14, s14, 31
+; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; SI-NEXT:    s_and_b32 s5, s14, 31
 ; SI-NEXT:    s_mov_b32 s11, s10
-; SI-NEXT:    s_and_b32 s13, s13, 31
-; SI-NEXT:    s_mov_b32 s6, s9
-; SI-NEXT:    s_mov_b32 s7, s9
-; SI-NEXT:    s_and_b32 s12, s12, 31
+; SI-NEXT:    s_lshr_b64 s[6:7], s[10:11], s5
+; SI-NEXT:    s_and_b32 s5, s13, 31
+; SI-NEXT:    s_mov_b32 s10, s9
+; SI-NEXT:    s_mov_b32 s11, s9
+; SI-NEXT:    s_lshr_b64 s[10:11], s[10:11], s5
+; SI-NEXT:    s_and_b32 s5, s12, 31
 ; SI-NEXT:    s_mov_b32 s9, s8
-; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s2
-; SI-NEXT:    s_lshr_b64 s[10:11], s[10:11], s14
-; SI-NEXT:    s_lshr_b64 s[6:7], s[6:7], s13
-; SI-NEXT:    s_lshr_b64 s[8:9], s[8:9], s12
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_lshr_b64 s[8:9], s[8:9], s5
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v1, s10
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v3, s4
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -337,34 +337,34 @@ define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s24, s19, 31
+; SI-NEXT:    s_and_b32 s6, s19, 31
 ; SI-NEXT:    s_mov_b32 s4, s11
 ; SI-NEXT:    s_mov_b32 s5, s11
-; SI-NEXT:    s_and_b32 s25, s18, 31
+; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; SI-NEXT:    s_and_b32 s5, s18, 31
 ; SI-NEXT:    s_mov_b32 s11, s10
-; SI-NEXT:    s_and_b32 s26, s17, 31
-; SI-NEXT:    s_mov_b32 s6, s9
-; SI-NEXT:    s_mov_b32 s7, s9
-; SI-NEXT:    s_and_b32 s27, s16, 31
+; SI-NEXT:    s_lshr_b64 s[6:7], s[10:11], s5
+; SI-NEXT:    s_and_b32 s5, s17, 31
+; SI-NEXT:    s_mov_b32 s10, s9
+; SI-NEXT:    s_mov_b32 s11, s9
+; SI-NEXT:    s_lshr_b64 s[10:11], s[10:11], s5
+; SI-NEXT:    s_and_b32 s5, s16, 31
 ; SI-NEXT:    s_mov_b32 s9, s8
-; SI-NEXT:    s_and_b32 s23, s23, 31
+; SI-NEXT:    s_lshr_b64 s[8:9], s[8:9], s5
+; SI-NEXT:    s_and_b32 s5, s23, 31
 ; SI-NEXT:    s_mov_b32 s16, s15
 ; SI-NEXT:    s_mov_b32 s17, s15
-; SI-NEXT:    s_and_b32 s22, s22, 31
+; SI-NEXT:    s_lshr_b64 s[16:17], s[16:17], s5
+; SI-NEXT:    s_and_b32 s5, s22, 31
 ; SI-NEXT:    s_mov_b32 s15, s14
-; SI-NEXT:    s_and_b32 s21, s21, 31
+; SI-NEXT:    s_lshr_b64 s[14:15], s[14:15], s5
+; SI-NEXT:    s_and_b32 s5, s21, 31
 ; SI-NEXT:    s_mov_b32 s18, s13
 ; SI-NEXT:    s_mov_b32 s19, s13
-; SI-NEXT:    s_and_b32 s20, s20, 31
+; SI-NEXT:    s_lshr_b64 s[18:19], s[18:19], s5
+; SI-NEXT:    s_and_b32 s5, s20, 31
 ; SI-NEXT:    s_mov_b32 s13, s12
-; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s24
-; SI-NEXT:    s_lshr_b64 s[10:11], s[10:11], s25
-; SI-NEXT:    s_lshr_b64 s[6:7], s[6:7], s26
-; SI-NEXT:    s_lshr_b64 s[16:17], s[16:17], s23
-; SI-NEXT:    s_lshr_b64 s[14:15], s[14:15], s22
-; SI-NEXT:    s_lshr_b64 s[18:19], s[18:19], s21
-; SI-NEXT:    s_lshr_b64 s[12:13], s[12:13], s20
-; SI-NEXT:    s_lshr_b64 s[8:9], s[8:9], s27
+; SI-NEXT:    s_lshr_b64 s[12:13], s[12:13], s5
 ; SI-NEXT:    v_mov_b32_e32 v0, s12
 ; SI-NEXT:    v_mov_b32_e32 v1, s18
 ; SI-NEXT:    v_mov_b32_e32 v2, s14
@@ -372,8 +372,8 @@ define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v1, s10
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v3, s4
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -585,9 +585,9 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 15, v2
 ; SI-NEXT:    v_sub_i32_e32 v2, vcc, 0, v2
+; SI-NEXT:    v_and_b32_e32 v2, 15, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, v1, v0
-; SI-NEXT:    v_and_b32_e32 v2, 15, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_short v0, v[4:5], s[4:7], 0 addr64 offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index a82415e2e8303..40634bb5ad2e9 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mattr=-flat-for-global | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx600 -mattr=-flat-for-global | FileCheck %s --check-prefix=SI
 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefixes=GFX89,VI
 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -| FileCheck %s --check-prefixes=GFX89,GFX9
 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -| FileCheck %s --check-prefixes=GFX11
@@ -17,11 +17,11 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add
 ; SI-NEXT:    s_mov_b32 s8, s2
 ; SI-NEXT:    s_mov_b32 s9, s3
 ; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-NEXT:    v_lshr_b64 v[0:1], v[0:1], 16
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    v_mov_b32_e32 v1, v0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -105,11 +105,11 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add
 ; SI-NEXT:    s_mov_b32 s8, s2
 ; SI-NEXT:    s_mov_b32 s9, s3
 ; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-NEXT:    v_lshr_b64 v[0:1], v[0:1], 16
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    v_mov_b32_e32 v1, v0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
index 9896e5f4c8cae..32c89bbd4ca01 100644
--- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -o - %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -o - %s | FileCheck -check-prefix=GCN %s
 
 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
 declare i32 @llvm.amdgcn.sffbh.i32(i32) nounwind readnone speculatable
@@ -8,23 +8,23 @@ define amdgpu_kernel void @select_constant_cttz(ptr addrspace(1) noalias %out, p
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s4, 1, s2
-; GCN-NEXT:    s_cmp_lg_u32 s2, 0
-; GCN-NEXT:    s_ff1_i32_b32 s2, s4
+; GCN-NEXT:    s_lshr_b32 s6, 1, s4
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 ; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT:    s_ff1_i32_b32 s8, s6
 ; GCN-NEXT:    s_and_b64 s[6:7], s[4:5], exec
-; GCN-NEXT:    s_cselect_b32 s2, -1, s2
-; GCN-NEXT:    s_flbit_i32 s6, s2
-; GCN-NEXT:    s_sub_i32 s8, 31, s6
-; GCN-NEXT:    s_cmp_eq_u32 s2, 0
+; GCN-NEXT:    s_cselect_b32 s6, -1, s8
+; GCN-NEXT:    s_flbit_i32 s7, s6
+; GCN-NEXT:    s_sub_i32 s8, 31, s7
+; GCN-NEXT:    s_cmp_eq_u32 s6, 0
 ; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GCN-NEXT:    s_cselect_b32 s4, -1, s8
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-select-hi32mask.ll b/llvm/test/CodeGen/AMDGPU/setcc-select-hi32mask.ll
index 9735b1265975b..c2a38e44c8bc2 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-select-hi32mask.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-select-hi32mask.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck %s
 
 define i32 @select.hi32.sgpr.eq(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
 ; CHECK-LABEL: select.hi32.sgpr.eq:
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-select.ll b/llvm/test/CodeGen/AMDGPU/setcc-select.ll
index 30c669c46ac1a..9fa246632b79f 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-select.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck %s
 
 define i32 @select.hi32.sgpr.ult(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
 ; CHECK-LABEL: select.hi32.sgpr.ult:
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir
index 3b02a2ef717e9..34bf123d58cc1 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn -run-pass si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
 ---
 name:            copy_to_vreg_1
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index b538d6066d551..eb7e9756c0079 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn < %s | FileCheck --check-prefixes=SI,SI-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn < %s | FileCheck --check-prefixes=SI,SI-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefixes=SI,SI-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefixes=SI,SI-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefixes=VI,VI-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefixes=VI,VI-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9,GFX9-SDAG %s
@@ -42,9 +42,9 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -401,9 +401,9 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_sub_i32_e32 v2, vcc, 64, v2
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -514,9 +514,9 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffbf, v2
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -627,9 +627,9 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_sub_i32_e32 v2, vcc, 0x41, v2
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -740,9 +740,9 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 16, v2
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -853,9 +853,9 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_sub_i32_e32 v2, vcc, -16, v2
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -966,9 +966,9 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 17, v2
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -1079,9 +1079,9 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_sub_i32_e32 v2, vcc, 0xffffffef, v2
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -1247,9 +1247,9 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -1405,8 +1405,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-SDAG-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
-; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_subrev_i32_e32 v0, vcc, 64, v3
 ; SI-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1424,10 +1424,10 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc0, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -1851,10 +1851,10 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v2
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
 ; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xffc00000, v2
 ; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -1870,15 +1870,15 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffc0, v3
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -1979,10 +1979,10 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, -7, v2
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
 ; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xffc00000, v2
 ; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -1998,15 +1998,15 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -7, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffc0, v3
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -7, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -2120,10 +2120,10 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v2
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
 ; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xff850000, v2
 ; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -2139,15 +2139,15 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffff85, v3
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -2262,10 +2262,10 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff
 ; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, -7, v2
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff
 ; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v3, v2
 ; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-SDAG-NEXT:    s_endpgm
@@ -2280,13 +2280,13 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -7, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -2403,14 +2403,14 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, -16, v3
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -2525,14 +2525,14 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0x3c00, v3
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -2660,14 +2660,14 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffbc00, v3
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -2780,10 +2780,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 32, v2
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
 ; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xffe00000, v2
 ; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -2799,15 +2799,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffe0, v3
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -2924,14 +2924,14 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffe0, v3
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -3030,10 +3030,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff
 ; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_subrev_i32_e32 v3, vcc, 32, v2
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff
 ; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v3, v2
 ; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-SDAG-NEXT:    s_endpgm
@@ -3048,13 +3048,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -3155,10 +3155,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, -16, v2
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
 ; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
 ; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3174,15 +3174,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, -16, v3
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -16, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -3299,14 +3299,14 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, -16, v3
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -3405,10 +3405,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff
 ; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, -16, v2
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff
 ; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v3, v2
 ; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-SDAG-NEXT:    s_endpgm
@@ -3423,13 +3423,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -16, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -3529,10 +3529,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc400, v2
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
 ; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0xc4000000, v2
 ; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3548,15 +3548,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffc400, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc400, v3
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffc400, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -3694,10 +3694,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 0x4400, v2
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
 ; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 0x44000000, v2
 ; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3713,15 +3713,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0x4400, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0x4400, v3
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0x4400, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -3859,10 +3859,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 0x4000, v2
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
 ; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 2.0, v2
 ; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3878,15 +3878,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0x4000, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0x4000, v3
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0x4000, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -3987,10 +3987,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
 ; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SI-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc000, v2
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xffff0000
 ; SI-SDAG-NEXT:    v_bfi_b32 v2, s4, v2, v3
 ; SI-SDAG-NEXT:    v_add_i32_e32 v2, vcc, -2.0, v2
 ; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -4006,15 +4006,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffc000, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc000, v3
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffc000, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -4132,12 +4132,12 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -4280,10 +4280,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
index 01bcdad3fc220..022ea8e39a1d2 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -p simplifycfg,amdgpu-unify-divergent-exit-nodes %s -S -o - | FileCheck %s --check-prefix=OPT
-; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 %s -o - | FileCheck %s --check-prefix=ISA
 
 define void @nested_inf_loop(i1 %0, i1 %1) {
 ; OPT-LABEL: define void @nested_inf_loop(
@@ -44,7 +44,6 @@ define void @nested_inf_loop(i1 %0, i1 %1) {
 ; ISA-NEXT:  ; %bb.4: ; %loop.exit.guard
 ; ISA-NEXT:    ; in Loop: Header=BB0_1 Depth=1
 ; ISA-NEXT:    s_or_b64 exec, exec, s[8:9]
-; ISA-NEXT:    s_mov_b64 vcc, 0
 ; ISA-NEXT:    s_mov_b64 s[8:9], 0
 ; ISA-NEXT:    s_branch .LBB0_1
 ; ISA-NEXT:  ; %bb.5: ; %DummyReturnBlock
@@ -116,17 +115,16 @@ define void @nested_inf_loop_callbr(i32 %0, i32 %1) {
 ; ISA-NEXT:    ; Label of block must be emitted
 ; ISA-NEXT:    ;;#ASMSTART
 ; ISA-NEXT:    ;;#ASMEND
-; ISA-NEXT:    s_mov_b64 s[6:7], -1
-; ISA-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
-; ISA-NEXT:    s_cbranch_execz .LBB1_5
+; ISA-NEXT:    s_mov_b64 s[8:9], -1
+; ISA-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; ISA-NEXT:  ; %bb.4: ; %TransitionBlock.target.BB3
 ; ISA-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; ISA-NEXT:    s_xor_b64 s[6:7], exec, -1
-; ISA-NEXT:  .LBB1_5: ; %loop.exit.guard
+; ISA-NEXT:    s_xor_b64 s[8:9], exec, -1
+; ISA-NEXT:  ; %bb.5: ; %loop.exit.guard
 ; ISA-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; ISA-NEXT:    s_or_b64 exec, exec, s[8:9]
-; ISA-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; ISA-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; ISA-NEXT:    s_mov_b64 s[6:7], 0
+; ISA-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; ISA-NEXT:    s_cbranch_vccz .LBB1_2
 ; ISA-NEXT:  ; %bb.6: ; %DummyReturnBlock
 ; ISA-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/si-i1-copies.mir b/llvm/test/CodeGen/AMDGPU/si-i1-copies.mir
index 9618abe1770bf..0dc8e1f9fd562 100644
--- a/llvm/test/CodeGen/AMDGPU/si-i1-copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-i1-copies.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=si-i1-copies -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=si-i1-copies -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN %s
 
 # Test that the new IMPLICIT_DEF is inserted in the correct location.
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir
index eddad05d976bd..e94ad35a45800 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=liveintervals,si-lower-control-flow,si-lower-control-flow -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -run-pass=liveintervals,si-lower-control-flow,si-lower-control-flow -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN %s
 
 # Check that assert is not triggered
 
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir
index c15e0d83f1bd8..f763e8c93ec18 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-i1-copies -o - %s | FileCheck -check-prefixes=GCN %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -passes=si-i1-copies -o - %s | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs -run-pass=si-i1-copies -o - %s | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs -passes=si-i1-copies -o - %s | FileCheck -check-prefixes=GCN %s
 
 ---
 name:              lcssa_phi
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-cycle-header.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-cycle-header.mir
index b34ec250e566d..0829157e079d1 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-cycle-header.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-cycle-header.mir
@@ -1,7 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
 
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR-SPILL %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -start-before=si-lower-sgpr-spills -stop-after=regallocfast,1 -verify-machineinstrs %s -o - | FileCheck -check-prefix=WWM-REGALLOC %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR-SPILL %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -O0 -start-before=si-lower-sgpr-spills -stop-after=regallocfast,1 -verify-machineinstrs %s -o - | FileCheck -check-prefix=WWM-REGALLOC %s
 
 
 # When SGPR spills to a virtual VGPR lane occur in both a loop header and the latch,
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-body.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-body.mir
index 3655376703d72..3ee2b101903e3 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-body.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-body.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR-SPILL %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR-SPILL %s
 
 ---
 name:            sgpr_spill_initial_insert_in_body_moves_to_preheader
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-latch.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-latch.mir
index eb4f63a4e2b0c..798a2e1e31ede 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-latch.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-initial-insert-in-latch.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR-SPILL %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR-SPILL %s
 
 ---
 name:            sgpr_spill_initial_insert_in_latch_moves_to_preheader
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-multi-entry-cycle.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-multi-entry-cycle.mir
index 73dd7637e6e25..ff0c70a158c64 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-multi-entry-cycle.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-multi-entry-cycle.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR-SPILL %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -start-before=si-lower-sgpr-spills -stop-after=regallocfast,1 -verify-machineinstrs %s -o - | FileCheck -check-prefix=WWM-REGALLOC %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR-SPILL %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -O0 -start-before=si-lower-sgpr-spills -stop-after=regallocfast,1 -verify-machineinstrs %s -o - | FileCheck -check-prefix=WWM-REGALLOC %s
 
 # Ensure that for a multi-entry cycle si-lower-sgpr-spills inserts
 # IMPLICIT_DEF into the NCD of the cycle's entries.
diff --git a/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll b/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll
index 77b87371d586b..d55f3644dc875 100644
--- a/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck %s
 
 ; Check we can compile this bugpoint-reduced test without an
 ; infinite loop in TLI.SimplifyDemandedBits() due to failure
@@ -17,37 +17,37 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
 define amdgpu_kernel void @foo(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture readonly %arg1, ptr addrspace(1) noalias nocapture %arg2, float %arg3, i1 %c0, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) local_unnamed_addr "amdgpu-flat-work-group-size"="128,128" !reqd_work_group_size !0 {
 ; CHECK-LABEL: foo:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CHECK-NEXT:    s_movk_i32 s0, 0x54
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_mad_u32_u24 v1, v1, s0, v2
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    s_mov_b32 m0, -1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_bitcmp1_b32 s2, 8
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT:    s_bitcmp1_b32 s2, 16
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; CHECK-NEXT:    s_bitcmp1_b32 s0, 8
+; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT:    s_bitcmp1_b32 s0, 16
 ; CHECK-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v2
 ; CHECK-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; CHECK-NEXT:    s_bitcmp1_b32 s2, 24
+; CHECK-NEXT:    s_bitcmp1_b32 s0, 24
 ; CHECK-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; CHECK-NEXT:    s_xor_b64 s[6:7], s[6:7], -1
-; CHECK-NEXT:    s_bitcmp1_b32 s3, 0
+; CHECK-NEXT:    s_bitcmp1_b32 s1, 0
 ; CHECK-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; CHECK-NEXT:    s_bitcmp1_b32 s3, 8
+; CHECK-NEXT:    s_bitcmp1_b32 s1, 8
+; CHECK-NEXT:    s_movk_i32 s0, 0x54
 ; CHECK-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; CHECK-NEXT:    v_mad_u32_u24 v0, v1, s0, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_and_b64 s[2:3], exec, s[6:7]
 ; CHECK-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
 ; CHECK-NEXT:    s_and_b64 s[6:7], exec, s[10:11]
 ; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[8:9]
-; CHECK-NEXT:    s_mov_b32 m0, -1
 ; CHECK-NEXT:  .LBB0_1: ; %.loopexit145
 ; CHECK-NEXT:    ; =>This Loop Header: Depth=1
 ; CHECK-NEXT:    ; Child Loop BB0_3 Depth 2
 ; CHECK-NEXT:    ; Child Loop BB0_4 Depth 3
 ; CHECK-NEXT:    ; Child Loop BB0_5 Depth 2
-; CHECK-NEXT:    v_mov_b32_e32 v2, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
 ; CHECK-NEXT:    s_branch .LBB0_3
 ; CHECK-NEXT:  .LBB0_2: ; %.loopexit
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=2
@@ -65,7 +65,7 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias nocapture readonly %arg,
 ; CHECK-NEXT:    ; Parent Loop BB0_1 Depth=1
 ; CHECK-NEXT:    ; Parent Loop BB0_3 Depth=2
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=3
-; CHECK-NEXT:    ds_write_b32 v3, v0
+; CHECK-NEXT:    ds_write_b32 v3, v1
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 32, v3
 ; CHECK-NEXT:    s_mov_b64 vcc, s[2:3]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_4
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index 8c823849616b9..ce9ec57b0cb82 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
@@ -10,25 +10,23 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i
 ; GFX6-LABEL: s_sint_to_fp_i64_to_f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    s_flbit_i32 s0, s3
-; GFX6-NEXT:    s_xor_b32 s1, s2, s3
-; GFX6-NEXT:    s_add_i32 s0, s0, -1
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 31
-; GFX6-NEXT:    s_add_i32 s1, s1, 32
-; GFX6-NEXT:    s_min_u32 s8, s0, s1
-; GFX6-NEXT:    s_lshl_b64 s[0:1], s[2:3], s8
-; GFX6-NEXT:    s_min_u32 s0, s0, 1
-; GFX6-NEXT:    s_or_b32 s0, s1, s0
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GFX6-NEXT:    s_sub_i32 s0, 32, s8
-; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s0
+; GFX6-NEXT:    s_xor_b32 s5, s2, s3
+; GFX6-NEXT:    s_flbit_i32 s4, s3
+; GFX6-NEXT:    s_ashr_i32 s5, s5, 31
+; GFX6-NEXT:    s_add_i32 s4, s4, -1
+; GFX6-NEXT:    s_add_i32 s5, s5, 32
+; GFX6-NEXT:    s_min_u32 s4, s4, s5
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
+; GFX6-NEXT:    s_min_u32 s2, s2, 1
+; GFX6-NEXT:    s_or_b32 s2, s3, s2
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX6-NEXT:    s_sub_i32 s2, 32, s4
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_sint_to_fp_i64_to_f16:
@@ -120,23 +118,23 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_xor_b32_e32 v0, v3, v4
+; GFX6-NEXT:    v_xor_b32_e32 v1, v3, v4
 ; GFX6-NEXT:    v_ffbh_i32_e32 v5, v4
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 32, v1
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, -1, v5
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
-; GFX6-NEXT:    v_min_u32_e32 v0, v5, v0
-; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
+; GFX6-NEXT:    v_min_u32_e32 v1, v5, v1
+; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v1
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
 ; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 32, v0
-; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    buffer_store_short v0, v[1:2], s[0:3], 0 addr64
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, v3, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX6-NEXT:    buffer_store_short v3, v[1:2], s[0:3], 0 addr64
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: v_sint_to_fp_i64_to_f16:
@@ -246,19 +244,19 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_xor_b32 s5, s2, s3
+; GFX6-NEXT:    s_flbit_i32 s4, s3
+; GFX6-NEXT:    s_ashr_i32 s5, s5, 31
+; GFX6-NEXT:    s_add_i32 s4, s4, -1
+; GFX6-NEXT:    s_add_i32 s5, s5, 32
+; GFX6-NEXT:    s_min_u32 s8, s4, s5
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX6-NEXT:    s_min_u32 s2, s2, 1
+; GFX6-NEXT:    s_or_b32 s2, s3, s2
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
 ; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    s_flbit_i32 s0, s3
-; GFX6-NEXT:    s_xor_b32 s1, s2, s3
-; GFX6-NEXT:    s_add_i32 s0, s0, -1
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 31
-; GFX6-NEXT:    s_add_i32 s1, s1, 32
-; GFX6-NEXT:    s_min_u32 s8, s0, s1
-; GFX6-NEXT:    s_lshl_b64 s[0:1], s[2:3], s8
-; GFX6-NEXT:    s_min_u32 s0, s0, 1
-; GFX6-NEXT:    s_or_b32 s0, s1, s0
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s0
 ; GFX6-NEXT:    s_sub_i32 s0, 32, s8
+; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -323,21 +321,21 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_xor_b32_e32 v0, v3, v4
+; GFX6-NEXT:    v_xor_b32_e32 v1, v3, v4
 ; GFX6-NEXT:    v_ffbh_i32_e32 v5, v4
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 32, v1
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, -1, v5
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
-; GFX6-NEXT:    v_min_u32_e32 v0, v5, v0
-; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
-; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
-; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 32, v0
-; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
+; GFX6-NEXT:    v_min_u32_e32 v5, v5, v1
+; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT:    v_min_u32_e32 v0, 1, v3
+; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 32, v5
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v3
 ; GFX6-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -411,36 +409,36 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
 define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 x i64> %in) #0{
 ; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_flbit_i32 s4, s11
-; GFX6-NEXT:    s_xor_b32 s5, s10, s11
-; GFX6-NEXT:    s_flbit_i32 s6, s9
-; GFX6-NEXT:    s_xor_b32 s7, s8, s9
-; GFX6-NEXT:    s_add_i32 s4, s4, -1
-; GFX6-NEXT:    s_ashr_i32 s5, s5, 31
-; GFX6-NEXT:    s_add_i32 s6, s6, -1
-; GFX6-NEXT:    s_ashr_i32 s7, s7, 31
-; GFX6-NEXT:    s_add_i32 s5, s5, 32
-; GFX6-NEXT:    s_add_i32 s7, s7, 32
-; GFX6-NEXT:    s_min_u32 s12, s4, s5
-; GFX6-NEXT:    s_min_u32 s13, s6, s7
-; GFX6-NEXT:    s_lshl_b64 s[4:5], s[10:11], s12
-; GFX6-NEXT:    s_sub_i32 s10, 32, s12
-; GFX6-NEXT:    s_lshl_b64 s[6:7], s[8:9], s13
-; GFX6-NEXT:    s_sub_i32 s8, 32, s13
-; GFX6-NEXT:    s_min_u32 s4, s4, 1
-; GFX6-NEXT:    s_min_u32 s6, s6, 1
-; GFX6-NEXT:    s_or_b32 s4, s5, s4
-; GFX6-NEXT:    s_or_b32 s5, s7, s6
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s5
-; GFX6-NEXT:    v_ldexp_f32_e64 v1, v0, s10
-; GFX6-NEXT:    v_ldexp_f32_e64 v0, v2, s8
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT:    s_xor_b32 s9, s2, s3
+; GFX6-NEXT:    s_flbit_i32 s8, s3
+; GFX6-NEXT:    s_ashr_i32 s9, s9, 31
+; GFX6-NEXT:    s_add_i32 s8, s8, -1
+; GFX6-NEXT:    s_add_i32 s9, s9, 32
+; GFX6-NEXT:    s_min_u32 s8, s8, s9
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX6-NEXT:    s_min_u32 s2, s2, 1
+; GFX6-NEXT:    s_or_b32 s2, s3, s2
+; GFX6-NEXT:    s_xor_b32 s3, s0, s1
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX6-NEXT:    s_flbit_i32 s2, s1
+; GFX6-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX6-NEXT:    s_add_i32 s2, s2, -1
+; GFX6-NEXT:    s_add_i32 s3, s3, 32
+; GFX6-NEXT:    s_min_u32 s2, s2, s3
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
+; GFX6-NEXT:    s_min_u32 s0, s0, 1
+; GFX6-NEXT:    s_or_b32 s0, s1, s0
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
+; GFX6-NEXT:    s_sub_i32 s0, 32, s8
+; GFX6-NEXT:    v_ldexp_f32_e64 v1, v0, s0
+; GFX6-NEXT:    s_sub_i32 s0, 32, s2
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, v2, s0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f32:
@@ -533,21 +531,21 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(1)
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v3, v4
-; GFX6-NEXT:    v_ffbh_i32_e32 v9, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v12, v1, v2
-; GFX6-NEXT:    v_ffbh_i32_e32 v13, v2
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_xor_b32_e32 v14, v7, v8
-; GFX6-NEXT:    v_ffbh_i32_e32 v15, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v16, v5, v6
+; GFX6-NEXT:    v_ffbh_i32_e32 v9, v4
+; GFX6-NEXT:    v_ffbh_i32_e32 v13, v2
+; GFX6-NEXT:    v_ffbh_i32_e32 v15, v8
 ; GFX6-NEXT:    v_ffbh_i32_e32 v17, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
-; GFX6-NEXT:    v_add_i32_e32 v9, vcc, -1, v9
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v12, 31, v12
-; GFX6-NEXT:    v_add_i32_e32 v13, vcc, -1, v13
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
-; GFX6-NEXT:    v_add_i32_e32 v15, vcc, -1, v15
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v16, 31, v16
+; GFX6-NEXT:    v_add_i32_e32 v9, vcc, -1, v9
+; GFX6-NEXT:    v_add_i32_e32 v13, vcc, -1, v13
+; GFX6-NEXT:    v_add_i32_e32 v15, vcc, -1, v15
 ; GFX6-NEXT:    v_add_i32_e32 v17, vcc, -1, v17
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GFX6-NEXT:    v_add_i32_e32 v12, vcc, 32, v12
@@ -560,11 +558,8 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
 ; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
 ; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, 32, v0
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[1:2], v9
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 32, v9
 ; GFX6-NEXT:    v_lshl_b64 v[7:8], v[7:8], v12
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 32, v12
 ; GFX6-NEXT:    v_lshl_b64 v[5:6], v[5:6], v13
-; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 32, v13
 ; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_min_u32_e32 v7, 1, v7
@@ -577,6 +572,9 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 32, v9
+; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 32, v12
+; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 32, v13
 ; GFX6-NEXT:    v_ldexp_f32_e32 v3, v3, v14
 ; GFX6-NEXT:    v_ldexp_f32_e32 v2, v0, v2
 ; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v9
@@ -734,36 +732,36 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_flbit_i32 s4, s11
-; GFX6-NEXT:    s_xor_b32 s5, s10, s11
-; GFX6-NEXT:    s_flbit_i32 s6, s9
-; GFX6-NEXT:    s_xor_b32 s7, s8, s9
-; GFX6-NEXT:    s_add_i32 s4, s4, -1
-; GFX6-NEXT:    s_ashr_i32 s5, s5, 31
-; GFX6-NEXT:    s_add_i32 s6, s6, -1
-; GFX6-NEXT:    s_ashr_i32 s7, s7, 31
-; GFX6-NEXT:    s_add_i32 s5, s5, 32
-; GFX6-NEXT:    s_add_i32 s7, s7, 32
-; GFX6-NEXT:    s_min_u32 s12, s4, s5
-; GFX6-NEXT:    s_min_u32 s13, s6, s7
-; GFX6-NEXT:    s_lshl_b64 s[4:5], s[10:11], s12
-; GFX6-NEXT:    s_sub_i32 s10, 32, s12
-; GFX6-NEXT:    s_lshl_b64 s[6:7], s[8:9], s13
-; GFX6-NEXT:    s_sub_i32 s8, 32, s13
-; GFX6-NEXT:    s_min_u32 s4, s4, 1
-; GFX6-NEXT:    s_min_u32 s6, s6, 1
-; GFX6-NEXT:    s_or_b32 s4, s5, s4
-; GFX6-NEXT:    s_or_b32 s5, s7, s6
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
-; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s10
-; GFX6-NEXT:    v_ldexp_f32_e64 v1, v1, s8
+; GFX6-NEXT:    s_xor_b32 s3, s10, s11
+; GFX6-NEXT:    s_flbit_i32 s2, s11
+; GFX6-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX6-NEXT:    s_add_i32 s2, s2, -1
+; GFX6-NEXT:    s_add_i32 s3, s3, 32
+; GFX6-NEXT:    s_min_u32 s4, s2, s3
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[10:11], s4
+; GFX6-NEXT:    s_min_u32 s2, s2, 1
+; GFX6-NEXT:    s_or_b32 s2, s3, s2
+; GFX6-NEXT:    s_xor_b32 s3, s8, s9
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX6-NEXT:    s_flbit_i32 s2, s9
+; GFX6-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX6-NEXT:    s_add_i32 s2, s2, -1
+; GFX6-NEXT:    s_add_i32 s3, s3, 32
+; GFX6-NEXT:    s_min_u32 s5, s2, s3
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[8:9], s5
+; GFX6-NEXT:    s_min_u32 s2, s2, 1
+; GFX6-NEXT:    s_or_b32 s2, s3, s2
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s2
+; GFX6-NEXT:    s_sub_i32 s4, 32, s4
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s4
+; GFX6-NEXT:    s_sub_i32 s2, 32, s5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_ldexp_f32_e64 v1, v1, s2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -904,25 +902,25 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(1)
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v3, v4
-; GFX6-NEXT:    v_ffbh_i32_e32 v9, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v12, v1, v2
-; GFX6-NEXT:    v_ffbh_i32_e32 v13, v2
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_xor_b32_e32 v14, v7, v8
+; GFX6-NEXT:    v_ffbh_i32_e32 v9, v4
+; GFX6-NEXT:    v_ffbh_i32_e32 v13, v2
 ; GFX6-NEXT:    v_ffbh_i32_e32 v15, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v16, v5, v6
-; GFX6-NEXT:    v_ffbh_i32_e32 v17, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
-; GFX6-NEXT:    v_add_i32_e32 v9, vcc, -1, v9
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v12, 31, v12
-; GFX6-NEXT:    v_add_i32_e32 v13, vcc, -1, v13
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
+; GFX6-NEXT:    v_ffbh_i32_e32 v17, v6
+; GFX6-NEXT:    v_add_i32_e32 v9, vcc, -1, v9
+; GFX6-NEXT:    v_add_i32_e32 v13, vcc, -1, v13
 ; GFX6-NEXT:    v_add_i32_e32 v15, vcc, -1, v15
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v16, 31, v16
-; GFX6-NEXT:    v_add_i32_e32 v17, vcc, -1, v17
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GFX6-NEXT:    v_add_i32_e32 v12, vcc, 32, v12
 ; GFX6-NEXT:    v_add_i32_e32 v14, vcc, 32, v14
+; GFX6-NEXT:    v_add_i32_e32 v17, vcc, -1, v17
 ; GFX6-NEXT:    v_add_i32_e32 v16, vcc, 32, v16
 ; GFX6-NEXT:    v_min_u32_e32 v0, v9, v0
 ; GFX6-NEXT:    v_min_u32_e32 v9, v13, v12
@@ -931,11 +929,8 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
 ; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
 ; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, 32, v0
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[1:2], v9
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 32, v9
 ; GFX6-NEXT:    v_lshl_b64 v[7:8], v[7:8], v12
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 32, v12
 ; GFX6-NEXT:    v_lshl_b64 v[5:6], v[5:6], v13
-; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 32, v13
 ; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_min_u32_e32 v7, 1, v7
@@ -945,16 +940,19 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
 ; GFX6-NEXT:    v_or_b32_e32 v1, v8, v7
 ; GFX6-NEXT:    v_or_b32_e32 v4, v6, v5
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 32, v9
+; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 32, v12
+; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 32, v13
 ; GFX6-NEXT:    v_ldexp_f32_e32 v3, v3, v14
-; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v9
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_ldexp_f32_e32 v2, v4, v12
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
diff --git a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
index 957b25af625ee..4b5d05167d53a 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
+++ b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-pre-emit-peephole -o -  %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs -run-pass=si-pre-emit-peephole -o -  %s | FileCheck %s
 
 ---
 name: skip_branch_taildup_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
index 40be0c6b67ee9..a55b8038d255a 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -amdgpu-enable-rewrite-partial-reg-uses=false -amdgpu-dce-in-ra=0 -stress-regalloc=1 -start-before=register-coalescer -stop-after=greedy,1 -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs -amdgpu-enable-rewrite-partial-reg-uses=false -amdgpu-dce-in-ra=0 -stress-regalloc=1 -start-before=register-coalescer -stop-after=greedy,1 -o - %s | FileCheck %s
 # https://bugs.llvm.org/show_bug.cgi?id=33620
 
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/spill-partial-csr-sgpr-live-ins.mir b/llvm/test/CodeGen/AMDGPU/spill-partial-csr-sgpr-live-ins.mir
index 7b3402494f39f..400bfa54980c2 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-partial-csr-sgpr-live-ins.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-partial-csr-sgpr-live-ins.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s
 
 ---
 name: spill_partial_live_csr_sgpr_test
diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir
index 866ce8a0c0293..7b6ecd7c037c3 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s
 
 ---
 name: spill_csr_sgpr_argument
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
index 42db92b15acf5..e1e40ccf70e8f 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -verify-regalloc -run-pass=greedy %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -verify-regalloc -run-pass=greedy %s -o - | FileCheck %s
 
 ---
 name: zextload_global_v64i16_to_v64i64
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-nolivesubranges.mir b/llvm/test/CodeGen/AMDGPU/splitkit-nolivesubranges.mir
index f50f2e98d4bfa..dcac8e737cb05 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-nolivesubranges.mir
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-nolivesubranges.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=greedy,virtregrewriter -verify-regalloc %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=greedy,virtregrewriter -verify-regalloc %s -o - | FileCheck %s
 
 # This test aims to trigger live-range splitting at a place where %0 subranges
 # are all dead, but the main live-range of %0 is still alive. %0 main range is
diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
index 2efe27df2d10d..db9df6e2f7fdd 100644
--- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-mesa3d < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx600 < %s | FileCheck %s
 
 define i1 @test_srem_odd(i29 %X) nounwind {
 ; CHECK-LABEL: test_srem_odd:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s4, 0x1f5a814b
-; CHECK-NEXT:    s_mov_b32 s5, 0x52bf5b
 ; CHECK-NEXT:    v_mul_lo_u32 v0, v0, s4
+; CHECK-NEXT:    s_mov_b32 s4, 0x52bf5b
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, 0x295fad, v0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x1fffffff, v0
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s5, v0
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %srem = srem i29 %X, 99
@@ -61,36 +61,36 @@ define <3 x i1> @test_srem_vec(<3 x i31> %X) nounwind {
 ; CHECK-LABEL: test_srem_vec:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_bfe_i32 v3, v2, 0, 31
-; CHECK-NEXT:    v_bfe_i32 v4, v1, 0, 31
-; CHECK-NEXT:    v_bfe_i32 v5, v0, 0, 31
+; CHECK-NEXT:    v_bfe_i32 v3, v0, 0, 31
 ; CHECK-NEXT:    s_mov_b32 s4, 0x38e38e39
-; CHECK-NEXT:    s_mov_b32 s5, 0xc71c71c7
-; CHECK-NEXT:    s_mov_b32 s6, 0x7ffffffd
-; CHECK-NEXT:    v_mul_hi_i32 v5, v5, s4
+; CHECK-NEXT:    v_mul_hi_i32 v3, v3, s4
+; CHECK-NEXT:    v_bfe_i32 v4, v1, 0, 31
 ; CHECK-NEXT:    v_mul_hi_i32 v4, v4, s4
-; CHECK-NEXT:    v_mul_hi_i32 v3, v3, s5
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 1, v5
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 31, v4
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 31, v3
+; CHECK-NEXT:    s_mov_b32 s4, 0xc71c71c7
+; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 31, v3
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
-; CHECK-NEXT:    v_mul_lo_u32 v5, v5, 9
-; CHECK-NEXT:    v_mul_lo_u32 v4, v4, 9
-; CHECK-NEXT:    v_mul_lo_u32 v3, v3, -9
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
-; CHECK-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v2
-; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT:    v_mul_lo_u32 v3, v3, 9
+; CHECK-NEXT:    v_bfe_i32 v5, v2, 0, 31
+; CHECK-NEXT:    v_mul_hi_i32 v5, v5, s4
+; CHECK-NEXT:    s_mov_b32 s4, 0x7ffffffd
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 31, v4
+; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
+; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 1, v5
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT:    v_mul_lo_u32 v3, v3, 9
+; CHECK-NEXT:    v_mul_lo_u32 v4, v4, -9
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v0
+; CHECK-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s6, v1
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir b/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir
index be1a8aceb8c90..d257caaa2d9b8 100644
--- a/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir
+++ b/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=early-tailduplication -verify-machineinstrs -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -passes=early-tailduplication -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -run-pass=early-tailduplication -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -passes=early-tailduplication -o - %s | FileCheck %s
 
 ---
 name:            stop_duplicate_cfg_intrinsic
diff --git a/llvm/test/CodeGen/AMDGPU/tail-dup-bundle.mir b/llvm/test/CodeGen/AMDGPU/tail-dup-bundle.mir
index 43708d32d4329..256ac06fe6f9f 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-dup-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/tail-dup-bundle.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=block-placement -tail-dup-placement-threshold=2 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=block-placement -tail-dup-placement-threshold=2 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
 # Check that tail duplication correctly counts instructions in a bundle.
 # The bundle below shall not be duplicated.
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
index f5c8cdb5ec571..23471d3fa1fb7 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 
 define amdgpu_kernel void @trunc_i64_bitcast_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
index 2d1c85e9b047c..fe1017a6add04 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -enable-var-scope -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=VI %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-regsequence.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-regsequence.mir
index f45a918d1d0f8..8cdf34d21c8ba 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-regsequence.mir
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-regsequence.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=liveintervals,twoaddressinstruction,register-coalescer -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -run-pass=liveintervals,twoaddressinstruction,register-coalescer -verify-machineinstrs -o - %s | FileCheck %s
 
 # Check that LiveIntervals are correctly updated when eliminating REG_SEQUENCE.
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem24.ll b/llvm/test/CodeGen/AMDGPU/udivrem24.ll
index d119f8400acf2..a465da4101a47 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem24.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem24.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=VI %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
 
@@ -14,21 +14,21 @@ define amdgpu_kernel void @udiv24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s2
 ; SI-NEXT:    s_mov_b32 s9, s3
-; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
-; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
+; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
+; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    v_rcp_f32_e32 v2, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
-; SI-NEXT:    v_rcp_f32_e32 v2, v1
-; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
+; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; SI-NEXT:    v_trunc_f32_e32 v2, v2
-; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
-; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v1
-; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -117,21 +117,21 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_in_out(ptr addrspace(1) %out,
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s2
 ; SI-NEXT:    s_mov_b32 s9, s3
-; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
-; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
+; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
+; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    v_rcp_f32_e32 v2, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
-; SI-NEXT:    v_rcp_f32_e32 v2, v1
-; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
+; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; SI-NEXT:    v_trunc_f32_e32 v2, v2
-; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
-; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v1
-; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -220,21 +220,21 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_in(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s2
 ; SI-NEXT:    s_mov_b32 s9, s3
-; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
-; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
+; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
+; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    v_rcp_f32_e32 v2, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
-; SI-NEXT:    v_rcp_f32_e32 v2, v1
-; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
+; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; SI-NEXT:    v_trunc_f32_e32 v2, v2
-; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
-; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v1
-; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -323,21 +323,21 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_out(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s2
 ; SI-NEXT:    s_mov_b32 s9, s3
-; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
-; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
+; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
+; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    v_rcp_f32_e32 v2, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
-; SI-NEXT:    v_rcp_f32_e32 v2, v1
-; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
+; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; SI-NEXT:    v_trunc_f32_e32 v2, v2
-; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
-; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v1
-; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -426,21 +426,21 @@ define amdgpu_kernel void @udiv24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s2
 ; SI-NEXT:    s_mov_b32 s9, s3
-; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:2
+; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
+; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; SI-NEXT:    v_rcp_f32_e32 v2, v1
-; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
+; SI-NEXT:    v_rcp_f32_e32 v2, v0
+; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; SI-NEXT:    v_trunc_f32_e32 v2, v2
-; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
-; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v1
-; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -523,32 +523,32 @@ define amdgpu_kernel void @udiv23_i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s4, 0x7fffff
-; SI-NEXT:    s_and_b32 s5, s5, 0x7fffff
-; SI-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; SI-NEXT:    s_sub_i32 s6, 0, s5
+; SI-NEXT:    s_and_b32 s4, s3, 0x7fffff
+; SI-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; SI-NEXT:    s_sub_i32 s3, 0, s4
+; SI-NEXT:    s_and_b32 s5, s2, 0x7fffff
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_rcp_f32_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; SI-NEXT:    v_mul_lo_u32 v1, s6, v0
+; SI-NEXT:    v_mul_lo_u32 v1, s3, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_mul_hi_u32 v0, s4, v0
+; SI-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    s_mul_i32 s6, s6, s4
+; SI-NEXT:    s_sub_i32 s5, s5, s6
+; SI-NEXT:    s_sub_i32 s6, s5, s4
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_mul_i32 s6, s6, s5
-; SI-NEXT:    s_sub_i32 s4, s4, s6
-; SI-NEXT:    s_sub_i32 s6, s4, s5
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    s_cselect_b32 s4, s6, s4
+; SI-NEXT:    s_cselect_b32 s5, s6, s5
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -638,32 +638,32 @@ define amdgpu_kernel void @udiv24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s4, 0xffffff
-; SI-NEXT:    s_and_b32 s5, s5, 0xffffff
-; SI-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; SI-NEXT:    s_sub_i32 s6, 0, s5
+; SI-NEXT:    s_and_b32 s4, s3, 0xffffff
+; SI-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; SI-NEXT:    s_sub_i32 s3, 0, s4
+; SI-NEXT:    s_and_b32 s5, s2, 0xffffff
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_rcp_f32_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; SI-NEXT:    v_mul_lo_u32 v1, s6, v0
+; SI-NEXT:    v_mul_lo_u32 v1, s3, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_mul_hi_u32 v0, s4, v0
+; SI-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    s_mul_i32 s6, s6, s4
+; SI-NEXT:    s_sub_i32 s5, s5, s6
+; SI-NEXT:    s_sub_i32 s6, s5, s4
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_mul_i32 s6, s6, s5
-; SI-NEXT:    s_sub_i32 s4, s4, s6
-; SI-NEXT:    s_sub_i32 s6, s4, s5
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    s_cselect_b32 s4, s6, s4
+; SI-NEXT:    s_cselect_b32 s5, s6, s5
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -753,32 +753,32 @@ define amdgpu_kernel void @no_udiv24_u23_u24_i32(ptr addrspace(1) %out, ptr addr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s4, 0x7fffff
-; SI-NEXT:    s_and_b32 s5, s5, 0xffffff
-; SI-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; SI-NEXT:    s_sub_i32 s6, 0, s5
+; SI-NEXT:    s_and_b32 s4, s3, 0xffffff
+; SI-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; SI-NEXT:    s_sub_i32 s3, 0, s4
+; SI-NEXT:    s_and_b32 s5, s2, 0x7fffff
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_rcp_f32_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; SI-NEXT:    v_mul_lo_u32 v1, s6, v0
+; SI-NEXT:    v_mul_lo_u32 v1, s3, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_mul_hi_u32 v0, s4, v0
+; SI-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    s_mul_i32 s6, s6, s4
+; SI-NEXT:    s_sub_i32 s5, s5, s6
+; SI-NEXT:    s_sub_i32 s6, s5, s4
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_mul_i32 s6, s6, s5
-; SI-NEXT:    s_sub_i32 s4, s4, s6
-; SI-NEXT:    s_sub_i32 s6, s4, s5
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    s_cselect_b32 s4, s6, s4
+; SI-NEXT:    s_cselect_b32 s5, s6, s5
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -868,32 +868,32 @@ define amdgpu_kernel void @no_udiv24_u24_u23_i32(ptr addrspace(1) %out, ptr addr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s4, 0xffffff
-; SI-NEXT:    s_and_b32 s5, s5, 0x7fffff
-; SI-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; SI-NEXT:    s_sub_i32 s6, 0, s5
+; SI-NEXT:    s_and_b32 s4, s3, 0x7fffff
+; SI-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; SI-NEXT:    s_sub_i32 s3, 0, s4
+; SI-NEXT:    s_and_b32 s5, s2, 0xffffff
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_rcp_f32_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; SI-NEXT:    v_mul_lo_u32 v1, s6, v0
+; SI-NEXT:    v_mul_lo_u32 v1, s3, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_mul_hi_u32 v0, s4, v0
+; SI-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    s_mul_i32 s6, s6, s4
+; SI-NEXT:    s_sub_i32 s5, s5, s6
+; SI-NEXT:    s_sub_i32 s6, s5, s4
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_mul_i32 s6, s6, s5
-; SI-NEXT:    s_sub_i32 s4, s4, s6
-; SI-NEXT:    s_sub_i32 s6, s4, s5
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    s_cselect_b32 s4, s6, s4
+; SI-NEXT:    s_cselect_b32 s5, s6, s5
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -983,32 +983,32 @@ define amdgpu_kernel void @udiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s4, 0x1ffffff
-; SI-NEXT:    s_and_b32 s5, s5, 0x1ffffff
-; SI-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; SI-NEXT:    s_sub_i32 s6, 0, s5
+; SI-NEXT:    s_and_b32 s4, s3, 0x1ffffff
+; SI-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; SI-NEXT:    s_sub_i32 s3, 0, s4
+; SI-NEXT:    s_and_b32 s5, s2, 0x1ffffff
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_rcp_f32_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; SI-NEXT:    v_mul_lo_u32 v1, s6, v0
+; SI-NEXT:    v_mul_lo_u32 v1, s3, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_mul_hi_u32 v0, s4, v0
+; SI-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    s_mul_i32 s6, s6, s4
+; SI-NEXT:    s_sub_i32 s5, s5, s6
+; SI-NEXT:    s_sub_i32 s6, s5, s4
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_mul_i32 s6, s6, s5
-; SI-NEXT:    s_sub_i32 s4, s4, s6
-; SI-NEXT:    s_sub_i32 s6, s4, s5
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    s_cselect_b32 s4, s6, s4
+; SI-NEXT:    s_cselect_b32 s5, s6, s5
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -1101,32 +1101,32 @@ define amdgpu_kernel void @test_no_udiv24_i32_1(ptr addrspace(1) %out, ptr addrs
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s4, 0xffffff
-; SI-NEXT:    s_and_b32 s5, s5, 0x1ffffff
-; SI-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; SI-NEXT:    s_sub_i32 s6, 0, s5
+; SI-NEXT:    s_and_b32 s4, s3, 0x1ffffff
+; SI-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; SI-NEXT:    s_sub_i32 s3, 0, s4
+; SI-NEXT:    s_and_b32 s5, s2, 0xffffff
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_rcp_f32_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; SI-NEXT:    v_mul_lo_u32 v1, s6, v0
+; SI-NEXT:    v_mul_lo_u32 v1, s3, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_mul_hi_u32 v0, s4, v0
+; SI-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    s_mul_i32 s6, s6, s4
+; SI-NEXT:    s_sub_i32 s5, s5, s6
+; SI-NEXT:    s_sub_i32 s6, s5, s4
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_mul_i32 s6, s6, s5
-; SI-NEXT:    s_sub_i32 s4, s4, s6
-; SI-NEXT:    s_sub_i32 s6, s4, s5
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    s_cselect_b32 s4, s6, s4
+; SI-NEXT:    s_cselect_b32 s5, s6, s5
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -1219,32 +1219,32 @@ define amdgpu_kernel void @test_no_udiv24_i32_2(ptr addrspace(1) %out, ptr addrs
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s4, 0x1ffffff
-; SI-NEXT:    s_and_b32 s5, s5, 0xffffff
-; SI-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; SI-NEXT:    s_sub_i32 s6, 0, s5
+; SI-NEXT:    s_and_b32 s4, s3, 0xffffff
+; SI-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; SI-NEXT:    s_sub_i32 s3, 0, s4
+; SI-NEXT:    s_and_b32 s5, s2, 0x1ffffff
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_rcp_f32_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; SI-NEXT:    v_mul_lo_u32 v1, s6, v0
+; SI-NEXT:    v_mul_lo_u32 v1, s3, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_mul_hi_u32 v0, s4, v0
+; SI-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    s_mul_i32 s6, s6, s4
+; SI-NEXT:    s_sub_i32 s5, s5, s6
+; SI-NEXT:    s_sub_i32 s6, s5, s4
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_mul_i32 s6, s6, s5
-; SI-NEXT:    s_sub_i32 s4, s4, s6
-; SI-NEXT:    s_sub_i32 s6, s4, s5
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    s_cselect_b32 s4, s6, s4
+; SI-NEXT:    s_cselect_b32 s5, s6, s5
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -1343,23 +1343,23 @@ define amdgpu_kernel void @urem24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s2
 ; SI-NEXT:    s_mov_b32 s9, s3
-; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
-; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
+; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
+; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
+; SI-NEXT:    v_rcp_f32_e32 v3, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
-; SI-NEXT:    v_rcp_f32_e32 v4, v3
-; SI-NEXT:    v_mul_f32_e32 v4, v2, v4
-; SI-NEXT:    v_trunc_f32_e32 v4, v4
-; SI-NEXT:    v_fma_f32 v2, -v4, v3, v2
-; SI-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
-; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v4, vcc
-; SI-NEXT:    v_mul_lo_u32 v1, v2, v1
-; SI-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v1
+; SI-NEXT:    v_mul_f32_e32 v3, v4, v3
+; SI-NEXT:    v_trunc_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_u32_f32_e32 v5, v3
+; SI-NEXT:    v_mad_f32 v3, -v3, v2, v4
+; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
+; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
+; SI-NEXT:    v_mul_lo_u32 v0, v2, v0
+; SI-NEXT:    v_subrev_i32_e32 v0, vcc, v0, v1
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -1452,23 +1452,23 @@ define amdgpu_kernel void @urem24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s2
 ; SI-NEXT:    s_mov_b32 s9, s3
-; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:2
+; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
+; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_u32_e32 v2, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_u32_e32 v3, v1
-; SI-NEXT:    v_rcp_f32_e32 v4, v3
-; SI-NEXT:    v_mul_f32_e32 v4, v2, v4
+; SI-NEXT:    v_rcp_f32_e32 v4, v2
+; SI-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; SI-NEXT:    v_trunc_f32_e32 v4, v4
-; SI-NEXT:    v_fma_f32 v2, -v4, v3, v2
-; SI-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
-; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v4, vcc
-; SI-NEXT:    v_mul_lo_u32 v1, v2, v1
-; SI-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
+; SI-NEXT:    v_cvt_u32_f32_e32 v5, v4
+; SI-NEXT:    v_mad_f32 v3, -v4, v2, v3
+; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
+; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
+; SI-NEXT:    v_mul_lo_u32 v0, v2, v0
+; SI-NEXT:    v_subrev_i32_e32 v0, vcc, v0, v1
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -1555,30 +1555,30 @@ define amdgpu_kernel void @urem24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s2, s4, 0xffffff
-; SI-NEXT:    s_and_b32 s4, s5, 0xffffff
+; SI-NEXT:    s_and_b32 s4, s3, 0xffffff
 ; SI-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; SI-NEXT:    s_sub_i32 s5, 0, s4
+; SI-NEXT:    s_sub_i32 s3, 0, s4
+; SI-NEXT:    s_and_b32 s5, s2, 0xffffff
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_rcp_f32_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; SI-NEXT:    v_mul_lo_u32 v1, s5, v0
+; SI-NEXT:    v_mul_lo_u32 v1, s3, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_mul_hi_u32 v0, s2, v0
-; SI-NEXT:    v_readfirstlane_b32 s5, v0
-; SI-NEXT:    s_mul_i32 s5, s5, s4
-; SI-NEXT:    s_sub_i32 s2, s2, s5
-; SI-NEXT:    s_sub_i32 s5, s2, s4
-; SI-NEXT:    s_cmp_ge_u32 s2, s4
-; SI-NEXT:    s_cselect_b32 s2, s5, s2
-; SI-NEXT:    s_sub_i32 s5, s2, s4
-; SI-NEXT:    s_cmp_ge_u32 s2, s4
-; SI-NEXT:    s_cselect_b32 s4, s5, s2
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_hi_u32 v0, s5, v0
+; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    s_mul_i32 s6, s6, s4
+; SI-NEXT:    s_sub_i32 s5, s5, s6
+; SI-NEXT:    s_sub_i32 s6, s5, s4
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
+; SI-NEXT:    s_cselect_b32 s5, s6, s5
+; SI-NEXT:    s_sub_i32 s6, s5, s4
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
+; SI-NEXT:    s_cselect_b32 s4, s6, s5
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -1666,30 +1666,30 @@ define amdgpu_kernel void @urem25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s2, s4, 0x1ffffff
-; SI-NEXT:    s_and_b32 s4, s5, 0x1ffffff
+; SI-NEXT:    s_and_b32 s4, s3, 0x1ffffff
 ; SI-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; SI-NEXT:    s_sub_i32 s5, 0, s4
+; SI-NEXT:    s_sub_i32 s3, 0, s4
+; SI-NEXT:    s_and_b32 s5, s2, 0x1ffffff
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_rcp_f32_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; SI-NEXT:    v_mul_lo_u32 v1, s5, v0
+; SI-NEXT:    v_mul_lo_u32 v1, s3, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_mul_hi_u32 v0, s2, v0
-; SI-NEXT:    v_readfirstlane_b32 s5, v0
-; SI-NEXT:    s_mul_i32 s5, s5, s4
-; SI-NEXT:    s_sub_i32 s2, s2, s5
-; SI-NEXT:    s_sub_i32 s5, s2, s4
-; SI-NEXT:    s_cmp_ge_u32 s2, s4
-; SI-NEXT:    s_cselect_b32 s2, s5, s2
-; SI-NEXT:    s_sub_i32 s5, s2, s4
-; SI-NEXT:    s_cmp_ge_u32 s2, s4
-; SI-NEXT:    s_cselect_b32 s4, s5, s2
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_hi_u32 v0, s5, v0
+; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    s_mul_i32 s6, s6, s4
+; SI-NEXT:    s_sub_i32 s5, s5, s6
+; SI-NEXT:    s_sub_i32 s6, s5, s4
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
+; SI-NEXT:    s_cselect_b32 s5, s6, s5
+; SI-NEXT:    s_sub_i32 s6, s5, s4
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
+; SI-NEXT:    s_cselect_b32 s4, s6, s5
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -1776,30 +1776,30 @@ define amdgpu_kernel void @test_no_urem24_i32_1(ptr addrspace(1) %out, ptr addrs
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s2, s4, 0xffffff
-; SI-NEXT:    s_and_b32 s4, s5, 0x1ffffff
+; SI-NEXT:    s_and_b32 s4, s3, 0x1ffffff
 ; SI-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; SI-NEXT:    s_sub_i32 s5, 0, s4
+; SI-NEXT:    s_sub_i32 s3, 0, s4
+; SI-NEXT:    s_and_b32 s5, s2, 0xffffff
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_rcp_f32_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; SI-NEXT:    v_mul_lo_u32 v1, s5, v0
+; SI-NEXT:    v_mul_lo_u32 v1, s3, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_mul_hi_u32 v0, s2, v0
-; SI-NEXT:    v_readfirstlane_b32 s5, v0
-; SI-NEXT:    s_mul_i32 s5, s5, s4
-; SI-NEXT:    s_sub_i32 s2, s2, s5
-; SI-NEXT:    s_sub_i32 s5, s2, s4
-; SI-NEXT:    s_cmp_ge_u32 s2, s4
-; SI-NEXT:    s_cselect_b32 s2, s5, s2
-; SI-NEXT:    s_sub_i32 s5, s2, s4
-; SI-NEXT:    s_cmp_ge_u32 s2, s4
-; SI-NEXT:    s_cselect_b32 s4, s5, s2
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_hi_u32 v0, s5, v0
+; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    s_mul_i32 s6, s6, s4
+; SI-NEXT:    s_sub_i32 s5, s5, s6
+; SI-NEXT:    s_sub_i32 s6, s5, s4
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
+; SI-NEXT:    s_cselect_b32 s5, s6, s5
+; SI-NEXT:    s_sub_i32 s6, s5, s4
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
+; SI-NEXT:    s_cselect_b32 s4, s6, s5
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -1886,30 +1886,30 @@ define amdgpu_kernel void @test_no_urem24_i32_2(ptr addrspace(1) %out, ptr addrs
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s2, s4, 0x1ffffff
-; SI-NEXT:    s_and_b32 s4, s5, 0xffffff
+; SI-NEXT:    s_and_b32 s4, s3, 0xffffff
 ; SI-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; SI-NEXT:    s_sub_i32 s5, 0, s4
+; SI-NEXT:    s_sub_i32 s3, 0, s4
+; SI-NEXT:    s_and_b32 s5, s2, 0x1ffffff
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_rcp_f32_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; SI-NEXT:    v_mul_lo_u32 v1, s5, v0
+; SI-NEXT:    v_mul_lo_u32 v1, s3, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_mul_hi_u32 v0, s2, v0
-; SI-NEXT:    v_readfirstlane_b32 s5, v0
-; SI-NEXT:    s_mul_i32 s5, s5, s4
-; SI-NEXT:    s_sub_i32 s2, s2, s5
-; SI-NEXT:    s_sub_i32 s5, s2, s4
-; SI-NEXT:    s_cmp_ge_u32 s2, s4
-; SI-NEXT:    s_cselect_b32 s2, s5, s2
-; SI-NEXT:    s_sub_i32 s5, s2, s4
-; SI-NEXT:    s_cmp_ge_u32 s2, s4
-; SI-NEXT:    s_cselect_b32 s4, s5, s2
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_hi_u32 v0, s5, v0
+; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    s_mul_i32 s6, s6, s4
+; SI-NEXT:    s_sub_i32 s5, s5, s6
+; SI-NEXT:    s_sub_i32 s6, s5, s4
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
+; SI-NEXT:    s_cselect_b32 s5, s6, s5
+; SI-NEXT:    s_sub_i32 s6, s5, s4
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
+; SI-NEXT:    s_cselect_b32 s4, s6, s5
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -1996,32 +1996,32 @@ define amdgpu_kernel void @test_udiv24_u16_u23_i32(ptr addrspace(1) %out, ptr ad
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_and_b32 s5, s5, 0x7fffff
-; SI-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; SI-NEXT:    s_sub_i32 s6, 0, s5
+; SI-NEXT:    s_and_b32 s4, s3, 0x7fffff
+; SI-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; SI-NEXT:    s_sub_i32 s3, 0, s4
+; SI-NEXT:    s_and_b32 s5, s2, 0xffff
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_rcp_f32_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; SI-NEXT:    v_mul_lo_u32 v1, s6, v0
+; SI-NEXT:    v_mul_lo_u32 v1, s3, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_mul_hi_u32 v0, s4, v0
+; SI-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    s_mul_i32 s6, s6, s4
+; SI-NEXT:    s_sub_i32 s5, s5, s6
+; SI-NEXT:    s_sub_i32 s6, s5, s4
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_mul_i32 s6, s6, s5
-; SI-NEXT:    s_sub_i32 s4, s4, s6
-; SI-NEXT:    s_sub_i32 s6, s4, s5
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    s_cselect_b32 s4, s6, s4
+; SI-NEXT:    s_cselect_b32 s5, s6, s5
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -2111,32 +2111,32 @@ define amdgpu_kernel void @test_udiv24_u23_u16_i32(ptr addrspace(1) %out, ptr ad
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s4, 0x7fffff
-; SI-NEXT:    s_and_b32 s5, s5, 0xffff
-; SI-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; SI-NEXT:    s_sub_i32 s6, 0, s5
+; SI-NEXT:    s_and_b32 s4, s3, 0xffff
+; SI-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; SI-NEXT:    s_sub_i32 s3, 0, s4
+; SI-NEXT:    s_and_b32 s5, s2, 0x7fffff
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_rcp_f32_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; SI-NEXT:    v_mul_lo_u32 v1, s6, v0
+; SI-NEXT:    v_mul_lo_u32 v1, s3, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_mul_hi_u32 v0, s4, v0
+; SI-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    s_mul_i32 s6, s6, s4
+; SI-NEXT:    s_sub_i32 s5, s5, s6
+; SI-NEXT:    s_sub_i32 s6, s5, s4
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_mul_i32 s6, s6, s5
-; SI-NEXT:    s_sub_i32 s4, s4, s6
-; SI-NEXT:    s_sub_i32 s6, s4, s5
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    s_cselect_b32 s4, s6, s4
+; SI-NEXT:    s_cselect_b32 s5, s6, s5
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; SI-NEXT:    s_cmp_ge_u32 s4, s5
+; SI-NEXT:    s_cmp_ge_u32 s5, s4
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index bf57f65a570cf..c2ef2666133b5 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
@@ -10,21 +10,19 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i
 ; GFX6-LABEL: s_uint_to_fp_i64_to_f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    s_flbit_i32_b32 s0, s3
-; GFX6-NEXT:    s_min_u32 s8, s0, 32
-; GFX6-NEXT:    s_lshl_b64 s[0:1], s[2:3], s8
-; GFX6-NEXT:    s_min_u32 s0, s0, 1
-; GFX6-NEXT:    s_or_b32 s0, s1, s0
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GFX6-NEXT:    s_sub_i32 s0, 32, s8
-; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s0
+; GFX6-NEXT:    s_flbit_i32_b32 s4, s3
+; GFX6-NEXT:    s_min_u32 s4, s4, 32
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
+; GFX6-NEXT:    s_min_u32 s2, s2, 1
+; GFX6-NEXT:    s_or_b32 s2, s3, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT:    s_sub_i32 s2, 32, s4
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: s_uint_to_fp_i64_to_f16:
@@ -104,19 +102,19 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_ffbh_u32_e32 v0, v4
-; GFX6-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
+; GFX6-NEXT:    v_ffbh_u32_e32 v1, v4
+; GFX6-NEXT:    v_min_u32_e32 v1, 32, v1
+; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v1
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
 ; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 32, v0
-; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    buffer_store_short v0, v[1:2], s[0:3], 0 addr64
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, v3, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX6-NEXT:    buffer_store_short v3, v[1:2], s[0:3], 0 addr64
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: v_uint_to_fp_i64_to_f16:
@@ -210,15 +208,15 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_flbit_i32_b32 s4, s3
+; GFX6-NEXT:    s_min_u32 s8, s4, 32
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX6-NEXT:    s_min_u32 s2, s2, 1
+; GFX6-NEXT:    s_or_b32 s2, s3, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    s_flbit_i32_b32 s0, s3
-; GFX6-NEXT:    s_min_u32 s8, s0, 32
-; GFX6-NEXT:    s_lshl_b64 s[0:1], s[2:3], s8
-; GFX6-NEXT:    s_min_u32 s0, s0, 1
-; GFX6-NEXT:    s_or_b32 s0, s1, s0
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
 ; GFX6-NEXT:    s_sub_i32 s0, 32, s8
+; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -275,17 +273,17 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_ffbh_u32_e32 v0, v4
-; GFX6-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
-; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
-; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 32, v0
-; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
+; GFX6-NEXT:    v_ffbh_u32_e32 v1, v4
+; GFX6-NEXT:    v_min_u32_e32 v5, 32, v1
+; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT:    v_min_u32_e32 v0, 1, v3
+; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 32, v5
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v3
 ; GFX6-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -355,21 +353,21 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_flbit_i32_b32 s8, s3
-; GFX6-NEXT:    s_flbit_i32_b32 s9, s1
 ; GFX6-NEXT:    s_min_u32 s8, s8, 32
-; GFX6-NEXT:    s_min_u32 s9, s9, 32
 ; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
-; GFX6-NEXT:    s_sub_i32 s8, 32, s8
-; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
-; GFX6-NEXT:    s_sub_i32 s9, 32, s9
 ; GFX6-NEXT:    s_min_u32 s2, s2, 1
-; GFX6-NEXT:    s_min_u32 s0, s0, 1
+; GFX6-NEXT:    s_flbit_i32_b32 s9, s1
 ; GFX6-NEXT:    s_or_b32 s2, s3, s2
-; GFX6-NEXT:    s_or_b32 s0, s1, s0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT:    s_min_u32 s2, s9, 32
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
+; GFX6-NEXT:    s_min_u32 s0, s0, 1
+; GFX6-NEXT:    s_or_b32 s0, s1, s0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s0
-; GFX6-NEXT:    v_ldexp_f32_e64 v1, v0, s8
-; GFX6-NEXT:    v_ldexp_f32_e64 v0, v2, s9
+; GFX6-NEXT:    s_sub_i32 s0, 32, s8
+; GFX6-NEXT:    v_ldexp_f32_e64 v1, v0, s0
+; GFX6-NEXT:    s_sub_i32 s0, 32, s2
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, v2, s0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -458,11 +456,8 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
 ; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
 ; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, 32, v0
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[1:2], v9
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 32, v9
 ; GFX6-NEXT:    v_lshl_b64 v[7:8], v[7:8], v12
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 32, v12
 ; GFX6-NEXT:    v_lshl_b64 v[5:6], v[5:6], v13
-; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 32, v13
 ; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_min_u32_e32 v7, 1, v7
@@ -475,6 +470,9 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 32, v9
+; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 32, v12
+; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 32, v13
 ; GFX6-NEXT:    v_ldexp_f32_e32 v3, v3, v14
 ; GFX6-NEXT:    v_ldexp_f32_e32 v2, v0, v2
 ; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v9
@@ -599,28 +597,28 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_flbit_i32_b32 s8, s3
-; GFX6-NEXT:    s_flbit_i32_b32 s9, s1
-; GFX6-NEXT:    s_min_u32 s8, s8, 32
-; GFX6-NEXT:    s_min_u32 s9, s9, 32
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
-; GFX6-NEXT:    s_sub_i32 s8, 32, s8
-; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
-; GFX6-NEXT:    s_sub_i32 s9, 32, s9
+; GFX6-NEXT:    s_flbit_i32_b32 s6, s3
+; GFX6-NEXT:    s_flbit_i32_b32 s7, s1
+; GFX6-NEXT:    s_min_u32 s6, s6, 32
+; GFX6-NEXT:    s_min_u32 s7, s7, 32
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
 ; GFX6-NEXT:    s_min_u32 s2, s2, 1
-; GFX6-NEXT:    s_min_u32 s0, s0, 1
+; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
 ; GFX6-NEXT:    s_or_b32 s2, s3, s2
-; GFX6-NEXT:    s_or_b32 s0, s1, s0
+; GFX6-NEXT:    s_min_u32 s0, s0, 1
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT:    s_or_b32 s0, s1, s0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s8
-; GFX6-NEXT:    v_ldexp_f32_e64 v1, v1, s9
+; GFX6-NEXT:    s_sub_i32 s6, 32, s6
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s6
+; GFX6-NEXT:    s_sub_i32 s0, 32, s7
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_ldexp_f32_e64 v1, v1, s0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -748,11 +746,8 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
 ; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
 ; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, 32, v0
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[1:2], v9
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 32, v9
 ; GFX6-NEXT:    v_lshl_b64 v[7:8], v[7:8], v12
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 32, v12
 ; GFX6-NEXT:    v_lshl_b64 v[5:6], v[5:6], v13
-; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 32, v13
 ; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_min_u32_e32 v7, 1, v7
@@ -762,16 +757,19 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
 ; GFX6-NEXT:    v_or_b32_e32 v1, v8, v7
 ; GFX6-NEXT:    v_or_b32_e32 v4, v6, v5
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 32, v9
+; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 32, v12
+; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 32, v13
 ; GFX6-NEXT:    v_ldexp_f32_e32 v3, v3, v14
-; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v9
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_ldexp_f32_e32 v2, v4, v12
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
index 31708a9b738db..237ff114b50ab 100644
--- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -early-live-intervals < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -early-live-intervals < %s | FileCheck %s
 
 ; We may have subregister live ranges that are undefined on some paths. The
 ; verifier should not complain about this.
@@ -106,9 +106,9 @@ define amdgpu_kernel void @partially_undef_copy() #0 {
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v6
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v7
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v8
+; CHECK-NEXT:    v_mov_b32_e32 v0, v6
 ; CHECK-NEXT:    s_mov_b32 s3, 0xf000
 ; CHECK-NEXT:    s_mov_b32 s2, -1
-; CHECK-NEXT:    v_mov_b32_e32 v0, v6
 ; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index 781de08ea4496..e12028f674d73 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn-amdhsa -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=gfx700 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
 ; RUN: opt -S -si-annotate-control-flow -mtriple=amdgcn-amdhsa -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI-OPT %s
 
 define hidden void @widget() #0 {
@@ -35,10 +35,10 @@ define hidden void @widget() #0 {
 ; GCN-NEXT:    flat_load_dword v0, v[0:1]
 ; GCN-NEXT:    s_mov_b64 s[20:21], -1
 ; GCN-NEXT:    s_mov_b64 s[16:17], 0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc, 21, v0
 ; GCN-NEXT:    s_mov_b64 s[54:55], 0
 ; GCN-NEXT:    s_mov_b64 s[18:19], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_gt_i32_e32 vcc, 21, v0
 ; GCN-NEXT:    s_cbranch_vccz .LBB0_9
 ; GCN-NEXT:  ; %bb.1: ; %Flow
 ; GCN-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
@@ -60,6 +60,9 @@ define hidden void @widget() #0 {
 ; GCN-NEXT:    s_mov_b32 s53, s15
 ; GCN-NEXT:    v_mov_b32_e32 v40, v31
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
+; GCN-NEXT:    s_andn2_b64 s[18:19], s[54:55], exec
+; GCN-NEXT:    s_and_b64 s[20:21], vcc, exec
 ; GCN-NEXT:    v_mov_b32_e32 v31, v40
 ; GCN-NEXT:    s_mov_b32 s12, s50
 ; GCN-NEXT:    s_mov_b32 s13, s51
@@ -69,19 +72,16 @@ define hidden void @widget() #0 {
 ; GCN-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[38:39]
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[48:49]
-; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
 ; GCN-NEXT:    s_mov_b64 s[16:17], 0
-; GCN-NEXT:    s_andn2_b64 s[18:19], s[54:55], exec
-; GCN-NEXT:    s_and_b64 s[20:21], vcc, exec
 ; GCN-NEXT:    s_or_b64 s[54:55], s[18:19], s[20:21]
 ; GCN-NEXT:  .LBB0_4: ; %Flow2
 ; GCN-NEXT:    s_and_saveexec_b64 s[18:19], s[54:55]
 ; GCN-NEXT:    s_xor_b64 s[18:19], exec, s[18:19]
 ; GCN-NEXT:    s_cbranch_execz .LBB0_6
 ; GCN-NEXT:  ; %bb.5: ; %bb12
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:  .LBB0_6: ; %Flow3
 ; GCN-NEXT:    s_or_b64 exec, exec, s[18:19]
@@ -93,6 +93,7 @@ define hidden void @widget() #0 {
 ; GCN-NEXT:    s_addc_u32 s17, s17, wibble@rel32@hi+12
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:  .LBB0_8: ; %UnifiedReturnBlock
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    v_readlane_b32 s30, v41, 14
 ; GCN-NEXT:    v_readlane_b32 s31, v41, 15
 ; GCN-NEXT:    v_readlane_b32 s55, v41, 13
@@ -109,7 +110,6 @@ define hidden void @widget() #0 {
 ; GCN-NEXT:    v_readlane_b32 s36, v41, 2
 ; GCN-NEXT:    v_readlane_b32 s35, v41, 1
 ; GCN-NEXT:    v_readlane_b32 s34, v41, 0
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v41, 16
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
@@ -121,7 +121,6 @@ define hidden void @widget() #0 {
 ; GCN-NEXT:  .LBB0_9: ; %bb2
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[54:55], 21, v0
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[18:19], 21, v0
-; GCN-NEXT:    s_mov_b64 vcc, exec
 ; GCN-NEXT:    s_cbranch_execnz .LBB0_2
 ; GCN-NEXT:  .LBB0_10: ; %bb4
 ; GCN-NEXT:    s_mov_b64 s[16:17], -1
@@ -292,7 +291,11 @@ define hidden void @blam() #0 {
 ; GCN-NEXT:    v_writelane_b32 v45, s81, 23
 ; GCN-NEXT:    v_writelane_b32 v45, s30, 24
 ; GCN-NEXT:    v_writelane_b32 v45, s31, 25
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    flat_load_dword v43, v[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v40, v31
+; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v40
 ; GCN-NEXT:    s_mov_b32 s54, s15
 ; GCN-NEXT:    s_mov_b32 s55, s14
 ; GCN-NEXT:    s_mov_b32 s64, s13
@@ -301,17 +304,13 @@ define hidden void @blam() #0 {
 ; GCN-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[38:39], s[6:7]
 ; GCN-NEXT:    s_mov_b64 s[48:49], s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    v_and_b32_e32 v2, 0x3ff, v40
-; GCN-NEXT:    flat_load_dword v43, v[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v42, 0
 ; GCN-NEXT:    s_mov_b64 s[66:67], 0
-; GCN-NEXT:    v_lshlrev_b32_e32 v41, 2, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v41, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v44, 0x7fc00000
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_f32_e64 s[68:69], 0, v43
 ; GCN-NEXT:    v_cmp_neq_f32_e64 s[50:51], 0, v43
-; GCN-NEXT:    v_mov_b32_e32 v44, 0x7fc00000
 ; GCN-NEXT:    s_branch .LBB1_2
 ; GCN-NEXT:  .LBB1_1: ; %Flow7
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
@@ -323,11 +322,11 @@ define hidden void @blam() #0 {
 ; GCN-NEXT:  .LBB1_2: ; %bb2
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    flat_load_dword v0, v[41:42]
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], 0
 ; GCN-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-NEXT:    s_mov_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, 2, v0
-; GCN-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GCN-NEXT:    s_xor_b64 s[70:71], exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_execz .LBB1_12
@@ -362,8 +361,8 @@ define hidden void @blam() #0 {
 ; GCN-NEXT:    s_cbranch_execz .LBB1_7
 ; GCN-NEXT:  ; %bb.6: ; %bb16
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], 0
 ; GCN-NEXT:    s_or_b64 s[8:9], s[68:69], exec
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], 0
 ; GCN-NEXT:  .LBB1_7: ; %Flow3
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
@@ -404,8 +403,8 @@ define hidden void @blam() #0 {
 ; GCN-NEXT:    s_cbranch_execz .LBB1_15
 ; GCN-NEXT:  ; %bb.14: ; %bb10
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], 0
 ; GCN-NEXT:    s_or_b64 s[10:11], s[6:7], exec
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], 0
 ; GCN-NEXT:  .LBB1_15: ; %Flow6
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[12:13]
@@ -422,11 +421,16 @@ define hidden void @blam() #0 {
 ; GCN-NEXT:    s_cbranch_execz .LBB1_1
 ; GCN-NEXT:  ; %bb.17: ; %bb18
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], 0
 ; GCN-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], 0
 ; GCN-NEXT:    s_branch .LBB1_1
 ; GCN-NEXT:  .LBB1_18: ; %DummyReturnBlock
 ; GCN-NEXT:    s_or_b64 exec, exec, s[66:67]
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GCN-NEXT:    v_readlane_b32 s30, v45, 24
 ; GCN-NEXT:    v_readlane_b32 s31, v45, 25
 ; GCN-NEXT:    v_readlane_b32 s81, v45, 23
@@ -453,11 +457,6 @@ define hidden void @blam() #0 {
 ; GCN-NEXT:    v_readlane_b32 s36, v45, 2
 ; GCN-NEXT:    v_readlane_b32 s35, v45, 1
 ; GCN-NEXT:    v_readlane_b32 s34, v45, 0
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s32, s33
 ; GCN-NEXT:    v_readlane_b32 s4, v45, 26
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
index a0dd0e7e78f9d..1c23a1bac9ec8 100644
--- a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-mesa3d < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx600 < %s | FileCheck %s
 
 define i1 @test_urem_odd(i13 %X) nounwind {
 ; CHECK-LABEL: test_urem_odd:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x1fff, v0
-; CHECK-NEXT:    s_movk_i32 s4, 0x667
 ; CHECK-NEXT:    v_mul_u32_u24_e32 v0, 0xccd, v0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x1fff, v0
+; CHECK-NEXT:    s_movk_i32 s4, 0x667
 ; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -22,13 +22,13 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s4, 0x6db6db7
-; CHECK-NEXT:    s_mov_b32 s5, 0x924925
 ; CHECK-NEXT:    v_mul_lo_u32 v0, v0, s4
+; CHECK-NEXT:    s_mov_b32 s4, 0x924925
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 26, v0
 ; CHECK-NEXT:    v_bfe_u32 v0, v0, 1, 26
 ; CHECK-NEXT:    v_or_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x7ffffff, v0
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s5, v0
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem i27 %X, 14
@@ -70,26 +70,26 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; CHECK-LABEL: test_urem_vec:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x7ff, v0
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7ff, v1
+; CHECK-NEXT:    s_mov_b32 s5, 0xb6db6db7
 ; CHECK-NEXT:    v_and_b32_e32 v2, 0x7ff, v2
 ; CHECK-NEXT:    s_mov_b32 s4, 0x8311eb33
-; CHECK-NEXT:    s_mov_b32 s5, 0x20140c
-; CHECK-NEXT:    s_mov_b32 s6, 0xb6db6db7
-; CHECK-NEXT:    s_mov_b32 s7, 0x24924924
-; CHECK-NEXT:    s_mov_b32 s8, 0xaaaaaaab
-; CHECK-NEXT:    s_mov_b32 s9, 0x2aaaaaaa
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x7ff, v0
+; CHECK-NEXT:    v_mul_lo_u32 v1, v1, s5
+; CHECK-NEXT:    s_mov_b32 s5, 0xaaaaaaab
 ; CHECK-NEXT:    v_mul_lo_u32 v2, v2, s4
-; CHECK-NEXT:    v_mul_lo_u32 v1, v1, s6
-; CHECK-NEXT:    v_mul_lo_u32 v0, v0, s8
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 0xf9dc299a, v2
+; CHECK-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 0x49249249, v1
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 0xf9dc299a, v2
 ; CHECK-NEXT:    v_alignbit_b32 v0, v0, v0, 1
-; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v0
+; CHECK-NEXT:    s_mov_b32 s6, 0x2aaaaaaa
+; CHECK-NEXT:    s_mov_b32 s5, 0x24924924
+; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0x20140c
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s7, v1
+; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v2
+; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem <3 x i11> %X, <i11 6, i11 7, i11 -5>
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index 23172eb2d8158..a445d1e25d49a 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
@@ -16,22 +16,23 @@ declare double @llvm.fabs.f64(double)
 define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
 ; SI-LABEL: v_cnd_nan_nosgpr:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    s_mov_b32 s7, s3
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_mov_b32 s3, s7
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_cmp_eq_u32 s8, 0
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_cmp_eq_u32 s0, 0
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_cnd_nan_nosgpr:
@@ -132,11 +133,11 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s3
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -516,7 +517,6 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o
 define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
 ; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
@@ -524,11 +524,12 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_nlg_f32_e64 vcc, s6, 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
@@ -618,7 +619,6 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o
 define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
 ; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
@@ -626,11 +626,12 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_nlg_f32_e64 vcc, s6, 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
@@ -725,12 +726,12 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v3, s8
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
@@ -814,19 +815,19 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o
 ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    v_cmp_le_f32_e32 vcc, 0, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -924,19 +925,19 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o
 ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 2, v3, vcc
 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -1034,19 +1035,19 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
 ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 glc
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[8:11], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
 ; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 2, v4, vcc
@@ -1149,21 +1150,21 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1)
 ; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
 ; SI-NEXT:    v_mov_b32_e32 v5, v2
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
+; SI-NEXT:    buffer_load_dword v6, v[1:2], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 4.0, v6
 ; SI-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
@@ -1280,21 +1281,21 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1)
 ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
 ; SI-NEXT:    v_mov_b32_e32 v5, v2
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
+; SI-NEXT:    buffer_load_dword v6, v[1:2], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    v_cmp_ge_f32_e32 vcc, 4.0, v6
 ; SI-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
@@ -1413,21 +1414,21 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1)
 ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
 ; SI-NEXT:    v_mov_b32_e32 v5, v2
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
+; SI-NEXT:    buffer_load_dword v6, v[1:2], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    v_cmp_le_f32_e32 vcc, 4.0, v6
 ; SI-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
@@ -1548,18 +1549,18 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[10:11]
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v3, v1
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[10:11]
 ; SI-NEXT:    buffer_load_dword v2, v[2:3], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; SI-NEXT:    v_and_b32_e32 v3, 1, v3
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
+; SI-NEXT:    v_and_b32_e32 v3, 1, v3
 ; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v3
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
@@ -1673,22 +1674,22 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1)
 ; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
 ; SI-NEXT:    v_mov_b32_e32 v4, v2
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
+; SI-NEXT:    buffer_load_dword v2, v[1:2], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[3:4], s[8:11], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, 0x3ff00000
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    v_cmp_le_f32_e32 vcc, 0, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
@@ -1797,21 +1798,21 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1)
 ; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
 ; SI-NEXT:    v_mov_b32_e32 v4, v2
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
+; SI-NEXT:    buffer_load_dword v2, v[1:2], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[3:4], s[8:11], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
@@ -1919,19 +1920,19 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1)
 ; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 4.0, v3, vcc
 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -2030,19 +2031,19 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add
 ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v2
 ; SI-NEXT:    v_cndmask_b32_e64 v2, v3, -1.0, vcc
 ; SI-NEXT:    v_cndmask_b32_e64 v3, v3, -2.0, vcc
@@ -2162,24 +2163,24 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add
 define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
 ; SI-LABEL: v_cndmask_abs_neg_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_mov_b32 s3, s7
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b32 s3, s7
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_cmp_lg_u32 s8, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_cmp_lg_u32 s0, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -2328,22 +2329,23 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c,
 define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
 ; SI-LABEL: v_cndmask_abs_neg_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    s_mov_b32 s7, s3
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_mov_b32 s3, s7
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_cmp_lg_u32 s8, 0
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_cmp_lg_u32 s0, 0
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cndmask_b32_e64 v0, -v0, |v0|, s[4:5]
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1]
+; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_cndmask_abs_neg_f32:
@@ -2437,24 +2439,24 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c,
 define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
 ; SI-LABEL: v_cndmask_abs_neg_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_mov_b32 s3, s7
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b32 s3, s7
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_cmp_lg_u32 s8, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_cmp_lg_u32 s0, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
 ; SI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;



More information about the llvm-branch-commits mailing list