[llvm] [DAG] visitFREEZE - replace multiple frozen/unfrozen uses of an SDValue with just the frozen node (PR #150017)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 31 07:23:47 PDT 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/150017
From f9eadbc5050104c7bdab5ec8597f0aeac83abc26 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 22 Jul 2025 14:12:52 +0100
Subject: [PATCH] [DAG] visitFREEZE - replace multiple frozen/unfrozen uses of
an SDValue with just the frozen node
Similar to InstCombinerImpl::freezeOtherUses, attempt to merge multiple frozen/unfrozen uses of an SDValue into a single use of the frozen node. This fixes a number of hasOneUse() failures that arise when trying to push FREEZE nodes through the DAG.
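As a rough IR-level sketch of the pattern being addressed (the combine itself operates on SelectionDAG nodes; the function and value names below are hypothetical, for illustration only):

  define i32 @mixed_uses(i32 %a, i32 %b) {
    %x = sub i32 %a, %b
    %fx = freeze i32 %x
    %u0 = add i32 %fx, 1   ; frozen use of %x
    %u1 = xor i32 %x, %u0  ; unfrozen use of %x
    ret i32 %u1
  }

After the combine, the unfrozen use in %u1 is rewritten to use %fx as well, so %x has a single user (the freeze) and hasOneUse()-gated folds on the frozen value can fire.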
Remove the SimplifyMultipleUseDemandedBits handling of FREEZE nodes, as we now want to keep the common frozen node rather than bypass it for some users just because of the DemandedElts mask.
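Again as a hypothetical IR-flavoured sketch (the removed code ran on DAG nodes with a DemandedElts mask):

  define i32 @demanded_lane(<2 x i32> %v) {
    %fv = freeze <2 x i32> %v
    %e0 = extractelement <2 x i32> %fv, i32 0
    %e1 = extractelement <2 x i32> %fv, i32 1
    %r = add i32 %e0, %e1
    ret i32 %r
  }

When lane 0 of %v was known not to be undef/poison, the old path could let the %e0 use look through the freeze to %v directly while %e1 kept using the frozen value, splitting %v's users between frozen and unfrozen nodes, which is exactly the split the new visitFREEZE combine works to undo.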
Fixes #149799
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 20 +-
.../CodeGen/SelectionDAG/TargetLowering.cpp | 7 -
llvm/test/CodeGen/AArch64/midpoint-int.ll | 18 +-
llvm/test/CodeGen/AMDGPU/div_i128.ll | 10 +-
llvm/test/CodeGen/AMDGPU/freeze.ll | 75 +-
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 357 ++--
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 754 +++----
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 1816 +++++++++--------
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 548 ++---
llvm/test/CodeGen/AMDGPU/load-global-i8.ll | 1565 +++++++-------
llvm/test/CodeGen/AMDGPU/load-local-i16.ll | 40 +-
llvm/test/CodeGen/AMDGPU/rem_i128.ll | 10 +-
llvm/test/CodeGen/AMDGPU/sdiv.ll | 784 +++----
.../AMDGPU/sext-in-reg-vector-shuffle.ll | 21 +-
llvm/test/CodeGen/NVPTX/i1-select.ll | 4 +-
llvm/test/CodeGen/RISCV/abds.ll | 216 +-
llvm/test/CodeGen/RISCV/fpclamptosat.ll | 208 +-
llvm/test/CodeGen/RISCV/iabs.ll | 80 +-
llvm/test/CodeGen/RISCV/rv32zbb.ll | 56 +-
llvm/test/CodeGen/RISCV/rv32zbs.ll | 12 +-
.../CodeGen/RISCV/rvv/fpclamptosat_vec.ll | 340 +--
.../CodeGen/RISCV/rvv/vec3-setcc-crash.ll | 42 +-
llvm/test/CodeGen/RISCV/rvv/vp-splice.ll | 136 +-
llvm/test/CodeGen/VE/Scalar/min.ll | 16 +-
llvm/test/CodeGen/X86/combine-sdiv.ll | 12 +-
llvm/test/CodeGen/X86/freeze-binary.ll | 22 +-
llvm/test/CodeGen/X86/freeze-vector.ll | 8 +-
llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 422 ++--
llvm/test/CodeGen/X86/midpoint-int-vec-512.ll | 160 +-
llvm/test/CodeGen/X86/midpoint-int.ll | 357 ++--
llvm/test/CodeGen/X86/oddsubvector.ll | 12 +-
llvm/test/CodeGen/X86/pr38539.ll | 2 +-
llvm/test/CodeGen/X86/vector-compress.ll | 976 ++++-----
llvm/test/CodeGen/X86/vector-fshl-rot-128.ll | 26 +-
llvm/test/CodeGen/X86/vector-fshl-rot-256.ll | 16 +-
llvm/test/CodeGen/X86/vector-fshr-rot-128.ll | 26 +-
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll | 8 +-
llvm/test/CodeGen/X86/vector-rotate-128.ll | 26 +-
llvm/test/CodeGen/X86/vector-rotate-256.ll | 16 +-
39 files changed, 4605 insertions(+), 4619 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 20b96f5e1bc00..59808db5458e4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4068,18 +4068,11 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
unsigned BitWidth = VT.getScalarSizeInBits();
SDLoc DL(N);
- auto PeekThroughFreeze = [](SDValue N) {
- if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
- return N->getOperand(0);
- return N;
- };
-
if (SDValue V = foldSubCtlzNot<EmptyMatchContext>(N, DAG))
return V;
// fold (sub x, x) -> 0
- // FIXME: Refactor this and xor and other similar operations together.
- if (PeekThroughFreeze(N0) == PeekThroughFreeze(N1))
+ if (N0 == N1)
return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
// fold (sub c1, c2) -> c3
@@ -16735,6 +16728,17 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false))
return N0;
+ // If we have frozen and unfrozen users of N0, update so everything uses N.
+ if (!N0.isUndef() && !N0.hasOneUse()) {
+ SDValue FrozenN0 = SDValue(N, 0);
+ DAG.ReplaceAllUsesOfValueWith(N0, FrozenN0);
+ // ReplaceAllUsesOfValueWith will have also updated the use in N, thus
+ // creating a cycle in the DAG. Let's undo that by mutating the freeze.
+ assert(N->getOperand(0) == FrozenN0 && "Expected cycle in DAG");
+ DAG.UpdateNodeOperands(N, N0);
+ return FrozenN0;
+ }
+
// We currently avoid folding freeze over SRA/SRL, due to the problems seen
// with (freeze (assert ext)) blocking simplifications of SRA/SRL. See for
// example https://reviews.llvm.org/D136529#4120959.
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1764910861df4..0df0d5a479385 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -775,13 +775,6 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
break;
}
- case ISD::FREEZE: {
- SDValue N0 = Op.getOperand(0);
- if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
- /*PoisonOnly=*/false, Depth + 1))
- return N0;
- break;
- }
case ISD::AND: {
LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
diff --git a/llvm/test/CodeGen/AArch64/midpoint-int.ll b/llvm/test/CodeGen/AArch64/midpoint-int.ll
index bbdce7c6e933b..a8993e3542cfb 100644
--- a/llvm/test/CodeGen/AArch64/midpoint-int.ll
+++ b/llvm/test/CodeGen/AArch64/midpoint-int.ll
@@ -61,11 +61,10 @@ define i32 @scalar_i32_signed_mem_reg(ptr %a1_addr, i32 %a2) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr w9, [x0]
; CHECK-NEXT: mov w8, #-1 // =0xffffffff
-; CHECK-NEXT: cmp w9, w1
; CHECK-NEXT: sub w10, w1, w9
-; CHECK-NEXT: cneg w8, w8, le
; CHECK-NEXT: subs w11, w9, w1
; CHECK-NEXT: csel w10, w11, w10, gt
+; CHECK-NEXT: cneg w8, w8, le
; CHECK-NEXT: lsr w10, w10, #1
; CHECK-NEXT: madd w0, w10, w8, w9
; CHECK-NEXT: ret
@@ -86,11 +85,10 @@ define i32 @scalar_i32_signed_reg_mem(i32 %a1, ptr %a2_addr) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr w9, [x1]
; CHECK-NEXT: mov w8, #-1 // =0xffffffff
-; CHECK-NEXT: cmp w0, w9
; CHECK-NEXT: sub w10, w9, w0
-; CHECK-NEXT: cneg w8, w8, le
; CHECK-NEXT: subs w9, w0, w9
; CHECK-NEXT: csel w9, w9, w10, gt
+; CHECK-NEXT: cneg w8, w8, le
; CHECK-NEXT: lsr w9, w9, #1
; CHECK-NEXT: madd w0, w9, w8, w0
; CHECK-NEXT: ret
@@ -112,11 +110,10 @@ define i32 @scalar_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; CHECK-NEXT: ldr w9, [x0]
; CHECK-NEXT: ldr w10, [x1]
; CHECK-NEXT: mov w8, #-1 // =0xffffffff
-; CHECK-NEXT: cmp w9, w10
; CHECK-NEXT: sub w11, w10, w9
-; CHECK-NEXT: cneg w8, w8, le
; CHECK-NEXT: subs w10, w9, w10
; CHECK-NEXT: csel w10, w10, w11, gt
+; CHECK-NEXT: cneg w8, w8, le
; CHECK-NEXT: lsr w10, w10, #1
; CHECK-NEXT: madd w0, w10, w8, w9
; CHECK-NEXT: ret
@@ -190,11 +187,10 @@ define i64 @scalar_i64_signed_mem_reg(ptr %a1_addr, i64 %a2) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr x9, [x0]
; CHECK-NEXT: mov x8, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: cmp x9, x1
; CHECK-NEXT: sub x10, x1, x9
-; CHECK-NEXT: cneg x8, x8, le
; CHECK-NEXT: subs x11, x9, x1
; CHECK-NEXT: csel x10, x11, x10, gt
+; CHECK-NEXT: cneg x8, x8, le
; CHECK-NEXT: lsr x10, x10, #1
; CHECK-NEXT: madd x0, x10, x8, x9
; CHECK-NEXT: ret
@@ -215,11 +211,10 @@ define i64 @scalar_i64_signed_reg_mem(i64 %a1, ptr %a2_addr) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr x9, [x1]
; CHECK-NEXT: mov x8, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: cmp x0, x9
; CHECK-NEXT: sub x10, x9, x0
-; CHECK-NEXT: cneg x8, x8, le
; CHECK-NEXT: subs x9, x0, x9
; CHECK-NEXT: csel x9, x9, x10, gt
+; CHECK-NEXT: cneg x8, x8, le
; CHECK-NEXT: lsr x9, x9, #1
; CHECK-NEXT: madd x0, x9, x8, x0
; CHECK-NEXT: ret
@@ -241,11 +236,10 @@ define i64 @scalar_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; CHECK-NEXT: ldr x9, [x0]
; CHECK-NEXT: ldr x10, [x1]
; CHECK-NEXT: mov x8, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: cmp x9, x10
; CHECK-NEXT: sub x11, x10, x9
-; CHECK-NEXT: cneg x8, x8, le
; CHECK-NEXT: subs x10, x9, x10
; CHECK-NEXT: csel x10, x10, x11, gt
+; CHECK-NEXT: cneg x8, x8, le
; CHECK-NEXT: lsr x10, x10, #1
; CHECK-NEXT: madd x0, x10, x8, x9
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index e6c38d29be949..55067023116f0 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -495,8 +495,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1
; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
-; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1
-; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
+; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1
+; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
@@ -2679,8 +2680,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1
; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
-; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1
-; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
+; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1
+; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index ac4f0df7506ae..308e86bbaf8fd 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -5692,10 +5692,6 @@ define void @freeze_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX6-SDAG-NEXT: s_mov_b32 s5, s6
; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX6-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
; GFX6-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
; GFX6-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -5725,10 +5721,6 @@ define void @freeze_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX7-SDAG-NEXT: s_mov_b32 s5, s6
; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
; GFX7-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
; GFX7-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6351,10 +6343,6 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX6-SDAG-NEXT: s_mov_b32 s5, s6
; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX6-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
; GFX6-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -6384,10 +6372,6 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX7-SDAG-NEXT: s_mov_b32 s5, s6
; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v4, v0
; GFX7-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
; GFX7-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -12347,14 +12331,9 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX6-SDAG-NEXT: s_mov_b32 s5, s6
; GFX6-SDAG-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v4, 8, v0
-; GFX6-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX6-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
; GFX6-SDAG-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -12392,14 +12371,9 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX7-SDAG-NEXT: s_mov_b32 s5, s6
; GFX7-SDAG-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 8, v0
-; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX7-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
; GFX7-SDAG-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -12474,11 +12448,7 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: global_load_dword v0, v[0:1], off
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: v_lshrrev_b16 v1, 8, v0
-; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-SDAG-NEXT: global_store_byte v[2:3], v4, off offset:2
+; GFX10-SDAG-NEXT: global_store_byte_d16_hi v[2:3], v0, off offset:2
; GFX10-SDAG-NEXT: global_store_short v[2:3], v0, off
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -12499,36 +12469,15 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX10-GISEL-NEXT: global_store_byte_d16_hi v[2:3], v0, off offset:2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: freeze_v3i8:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_b32 v1, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_clause 0x1
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[2:3], v4, off offset:2
-; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: freeze_v3i8:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b16 v1, 8, v0
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v0
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX11-SDAG-FAKE16-NEXT: s_clause 0x1
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[2:3], v0, off offset:2
-; GFX11-SDAG-FAKE16-NEXT: global_store_b16 v[2:3], v1, off
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: freeze_v3i8:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_d16_hi_b8 v[2:3], v0, off offset:2
+; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: freeze_v3i8:
; GFX11-GISEL: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index bfc01ef138721..d59f72ad7a1ac 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -8343,53 +8343,53 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshr_b32 s42, s5, 30
-; GFX6-NEXT: s_lshr_b32 s36, s5, 28
-; GFX6-NEXT: s_lshr_b32 s38, s5, 29
-; GFX6-NEXT: s_lshr_b32 s30, s5, 26
-; GFX6-NEXT: s_lshr_b32 s34, s5, 27
-; GFX6-NEXT: s_lshr_b32 s26, s5, 24
-; GFX6-NEXT: s_lshr_b32 s28, s5, 25
-; GFX6-NEXT: s_lshr_b32 s22, s5, 22
-; GFX6-NEXT: s_lshr_b32 s24, s5, 23
-; GFX6-NEXT: s_lshr_b32 s18, s5, 20
-; GFX6-NEXT: s_lshr_b32 s20, s5, 21
-; GFX6-NEXT: s_lshr_b32 s14, s5, 18
-; GFX6-NEXT: s_lshr_b32 s16, s5, 19
-; GFX6-NEXT: s_lshr_b32 s10, s5, 16
-; GFX6-NEXT: s_lshr_b32 s12, s5, 17
-; GFX6-NEXT: s_lshr_b32 s6, s5, 14
-; GFX6-NEXT: s_lshr_b32 s8, s5, 15
-; GFX6-NEXT: s_mov_b32 s40, s5
+; GFX6-NEXT: s_lshr_b32 s36, s4, 30
+; GFX6-NEXT: s_lshr_b32 s38, s4, 31
+; GFX6-NEXT: s_lshr_b32 s30, s4, 28
+; GFX6-NEXT: s_lshr_b32 s34, s4, 29
+; GFX6-NEXT: s_lshr_b32 s26, s4, 26
+; GFX6-NEXT: s_lshr_b32 s28, s4, 27
+; GFX6-NEXT: s_lshr_b32 s22, s4, 24
+; GFX6-NEXT: s_lshr_b32 s24, s4, 25
+; GFX6-NEXT: s_lshr_b32 s18, s4, 22
+; GFX6-NEXT: s_lshr_b32 s20, s4, 23
+; GFX6-NEXT: s_lshr_b32 s14, s4, 20
+; GFX6-NEXT: s_lshr_b32 s16, s4, 21
+; GFX6-NEXT: s_lshr_b32 s10, s4, 18
+; GFX6-NEXT: s_lshr_b32 s12, s4, 19
+; GFX6-NEXT: s_lshr_b32 s6, s4, 16
+; GFX6-NEXT: s_lshr_b32 s8, s4, 17
; GFX6-NEXT: s_ashr_i32 s7, s5, 31
-; GFX6-NEXT: s_bfe_i64 s[44:45], s[40:41], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000
; GFX6-NEXT: v_mov_b32_e32 v4, s7
-; GFX6-NEXT: s_lshr_b32 s40, s5, 12
+; GFX6-NEXT: s_lshr_b32 s40, s4, 14
; GFX6-NEXT: v_mov_b32_e32 v0, s44
; GFX6-NEXT: v_mov_b32_e32 v1, s45
-; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000
+; GFX6-NEXT: s_mov_b32 s44, s5
+; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
; GFX6-NEXT: v_mov_b32_e32 v6, s44
; GFX6-NEXT: v_mov_b32_e32 v7, s45
-; GFX6-NEXT: s_lshr_b32 s44, s5, 13
+; GFX6-NEXT: s_lshr_b32 s44, s4, 15
; GFX6-NEXT: v_mov_b32_e32 v2, s42
; GFX6-NEXT: v_mov_b32_e32 v3, s43
-; GFX6-NEXT: s_lshr_b32 s42, s5, 10
+; GFX6-NEXT: s_lshr_b32 s42, s4, 12
; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
; GFX6-NEXT: v_mov_b32_e32 v8, s36
; GFX6-NEXT: v_mov_b32_e32 v9, s37
-; GFX6-NEXT: s_lshr_b32 s36, s5, 11
+; GFX6-NEXT: s_lshr_b32 s36, s4, 13
; GFX6-NEXT: v_mov_b32_e32 v10, s38
; GFX6-NEXT: v_mov_b32_e32 v11, s39
-; GFX6-NEXT: s_lshr_b32 s38, s5, 8
+; GFX6-NEXT: s_lshr_b32 s38, s4, 10
; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
; GFX6-NEXT: v_mov_b32_e32 v12, s30
; GFX6-NEXT: v_mov_b32_e32 v13, s31
-; GFX6-NEXT: s_lshr_b32 s30, s5, 9
+; GFX6-NEXT: s_lshr_b32 s30, s4, 11
; GFX6-NEXT: v_mov_b32_e32 v14, s34
; GFX6-NEXT: v_mov_b32_e32 v15, s35
-; GFX6-NEXT: s_lshr_b32 s34, s5, 6
+; GFX6-NEXT: s_lshr_b32 s34, s4, 8
; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
; GFX6-NEXT: v_mov_b32_e32 v5, s7
@@ -8397,190 +8397,191 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s26
; GFX6-NEXT: v_mov_b32_e32 v3, s27
-; GFX6-NEXT: s_lshr_b32 s26, s5, 7
+; GFX6-NEXT: s_lshr_b32 s26, s4, 9
; GFX6-NEXT: v_mov_b32_e32 v4, s28
; GFX6-NEXT: v_mov_b32_e32 v5, s29
-; GFX6-NEXT: s_lshr_b32 s28, s5, 4
+; GFX6-NEXT: s_lshr_b32 s28, s4, 6
; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:240
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v8, s22
; GFX6-NEXT: v_mov_b32_e32 v9, s23
-; GFX6-NEXT: s_lshr_b32 s22, s5, 5
+; GFX6-NEXT: s_lshr_b32 s22, s4, 7
; GFX6-NEXT: v_mov_b32_e32 v10, s24
; GFX6-NEXT: v_mov_b32_e32 v11, s25
-; GFX6-NEXT: s_lshr_b32 s24, s5, 2
+; GFX6-NEXT: s_lshr_b32 s24, s4, 4
; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:224
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v12, s18
; GFX6-NEXT: v_mov_b32_e32 v13, s19
-; GFX6-NEXT: s_lshr_b32 s18, s5, 3
+; GFX6-NEXT: s_lshr_b32 s18, s4, 5
; GFX6-NEXT: v_mov_b32_e32 v14, s20
; GFX6-NEXT: v_mov_b32_e32 v15, s21
-; GFX6-NEXT: s_lshr_b32 s20, s5, 1
+; GFX6-NEXT: s_lshr_b32 s20, s4, 2
; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:448
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:208
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s14
; GFX6-NEXT: v_mov_b32_e32 v3, s15
-; GFX6-NEXT: s_lshr_b32 s14, s4, 30
+; GFX6-NEXT: s_lshr_b32 s14, s4, 3
; GFX6-NEXT: v_mov_b32_e32 v4, s16
; GFX6-NEXT: v_mov_b32_e32 v5, s17
-; GFX6-NEXT: s_lshr_b32 s16, s4, 31
+; GFX6-NEXT: s_lshr_b32 s16, s4, 1
; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:432
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v8, s10
; GFX6-NEXT: v_mov_b32_e32 v9, s11
-; GFX6-NEXT: s_lshr_b32 s10, s4, 28
+; GFX6-NEXT: s_lshr_b32 s10, s5, 29
; GFX6-NEXT: v_mov_b32_e32 v10, s12
; GFX6-NEXT: v_mov_b32_e32 v11, s13
-; GFX6-NEXT: s_lshr_b32 s12, s4, 29
+; GFX6-NEXT: s_lshr_b32 s12, s5, 28
; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:416
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v12, s6
; GFX6-NEXT: v_mov_b32_e32 v13, s7
-; GFX6-NEXT: s_lshr_b32 s46, s4, 26
+; GFX6-NEXT: s_lshr_b32 s6, s5, 26
; GFX6-NEXT: v_mov_b32_e32 v14, s8
; GFX6-NEXT: v_mov_b32_e32 v15, s9
-; GFX6-NEXT: s_lshr_b32 s8, s4, 27
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x10000
+; GFX6-NEXT: s_lshr_b32 s8, s5, 27
+; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:400
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s40
; GFX6-NEXT: v_mov_b32_e32 v3, s41
-; GFX6-NEXT: s_lshr_b32 s40, s4, 24
-; GFX6-NEXT: v_mov_b32_e32 v4, s6
-; GFX6-NEXT: v_mov_b32_e32 v5, s7
-; GFX6-NEXT: s_lshr_b32 s44, s4, 25
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[36:37], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[36:37], s[42:43], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:384
+; GFX6-NEXT: s_lshr_b32 s40, s5, 25
+; GFX6-NEXT: v_mov_b32_e32 v4, s44
+; GFX6-NEXT: v_mov_b32_e32 v5, s45
+; GFX6-NEXT: s_lshr_b32 s44, s5, 24
+; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v8, s36
-; GFX6-NEXT: v_mov_b32_e32 v9, s37
-; GFX6-NEXT: s_lshr_b32 s36, s4, 22
-; GFX6-NEXT: v_mov_b32_e32 v10, s6
-; GFX6-NEXT: v_mov_b32_e32 v11, s7
-; GFX6-NEXT: s_lshr_b32 s42, s4, 23
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[30:31], s[38:39], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:368
+; GFX6-NEXT: v_mov_b32_e32 v8, s42
+; GFX6-NEXT: v_mov_b32_e32 v9, s43
+; GFX6-NEXT: s_lshr_b32 s42, s5, 22
+; GFX6-NEXT: v_mov_b32_e32 v10, s36
+; GFX6-NEXT: v_mov_b32_e32 v11, s37
+; GFX6-NEXT: s_lshr_b32 s36, s5, 23
+; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v12, s30
-; GFX6-NEXT: v_mov_b32_e32 v13, s31
-; GFX6-NEXT: s_lshr_b32 s30, s4, 20
-; GFX6-NEXT: v_mov_b32_e32 v14, s6
-; GFX6-NEXT: v_mov_b32_e32 v15, s7
-; GFX6-NEXT: s_lshr_b32 s6, s4, 21
+; GFX6-NEXT: v_mov_b32_e32 v12, s38
+; GFX6-NEXT: v_mov_b32_e32 v13, s39
+; GFX6-NEXT: s_lshr_b32 s38, s5, 20
+; GFX6-NEXT: v_mov_b32_e32 v14, s30
+; GFX6-NEXT: v_mov_b32_e32 v15, s31
+; GFX6-NEXT: s_lshr_b32 s4, s5, 21
; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:352
-; GFX6-NEXT: v_mov_b32_e32 v16, s34
-; GFX6-NEXT: v_mov_b32_e32 v17, s35
-; GFX6-NEXT: s_lshr_b32 s34, s4, 18
-; GFX6-NEXT: v_mov_b32_e32 v18, s26
-; GFX6-NEXT: v_mov_b32_e32 v19, s27
-; GFX6-NEXT: s_lshr_b32 s26, s4, 19
+; GFX6-NEXT: s_bfe_i64 s[30:31], s[34:35], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s30
+; GFX6-NEXT: v_mov_b32_e32 v3, s31
+; GFX6-NEXT: s_lshr_b32 s30, s5, 18
+; GFX6-NEXT: v_mov_b32_e32 v4, s26
+; GFX6-NEXT: v_mov_b32_e32 v5, s27
+; GFX6-NEXT: s_lshr_b32 s26, s5, 19
; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:336
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v8, s28
; GFX6-NEXT: v_mov_b32_e32 v9, s29
-; GFX6-NEXT: s_lshr_b32 s28, s4, 16
+; GFX6-NEXT: s_lshr_b32 s28, s5, 17
; GFX6-NEXT: v_mov_b32_e32 v10, s22
; GFX6-NEXT: v_mov_b32_e32 v11, s23
-; GFX6-NEXT: s_lshr_b32 s22, s4, 17
+; GFX6-NEXT: s_lshr_b32 s22, s5, 16
+; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:320
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v12, s24
; GFX6-NEXT: v_mov_b32_e32 v13, s25
-; GFX6-NEXT: s_lshr_b32 s24, s4, 14
-; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX6-NEXT: s_lshr_b32 s24, s5, 14
; GFX6-NEXT: v_mov_b32_e32 v14, s18
; GFX6-NEXT: v_mov_b32_e32 v15, s19
-; GFX6-NEXT: s_lshr_b32 s18, s4, 15
-; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: v_mov_b32_e32 v3, s21
-; GFX6-NEXT: s_lshr_b32 s20, s4, 12
-; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX6-NEXT: s_lshr_b32 s18, s5, 15
+; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64
+; GFX6-NEXT: v_mov_b32_e32 v16, s20
+; GFX6-NEXT: v_mov_b32_e32 v17, s21
+; GFX6-NEXT: s_lshr_b32 s20, s5, 12
; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:304
+; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v18, s14
+; GFX6-NEXT: v_mov_b32_e32 v19, s15
+; GFX6-NEXT: s_lshr_b32 s14, s5, 13
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v16, s14
-; GFX6-NEXT: v_mov_b32_e32 v17, s15
-; GFX6-NEXT: s_lshr_b32 s14, s4, 13
-; GFX6-NEXT: v_mov_b32_e32 v18, s16
-; GFX6-NEXT: v_mov_b32_e32 v19, s17
-; GFX6-NEXT: s_lshr_b32 s16, s4, 10
+; GFX6-NEXT: v_mov_b32_e32 v2, s16
+; GFX6-NEXT: v_mov_b32_e32 v3, s17
+; GFX6-NEXT: s_lshr_b32 s16, s5, 10
; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:288
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v8, s10
-; GFX6-NEXT: v_mov_b32_e32 v9, s11
-; GFX6-NEXT: s_lshr_b32 s10, s4, 11
-; GFX6-NEXT: v_mov_b32_e32 v10, s12
-; GFX6-NEXT: v_mov_b32_e32 v11, s13
-; GFX6-NEXT: s_lshr_b32 s12, s4, 8
+; GFX6-NEXT: v_mov_b32_e32 v8, s12
+; GFX6-NEXT: v_mov_b32_e32 v9, s13
+; GFX6-NEXT: s_lshr_b32 s12, s5, 11
+; GFX6-NEXT: v_mov_b32_e32 v10, s10
+; GFX6-NEXT: v_mov_b32_e32 v11, s11
+; GFX6-NEXT: s_lshr_b32 s10, s5, 8
; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[38:39], s[46:47], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:272
+; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v12, s38
-; GFX6-NEXT: v_mov_b32_e32 v13, s39
-; GFX6-NEXT: s_lshr_b32 s38, s4, 9
+; GFX6-NEXT: v_mov_b32_e32 v12, s6
+; GFX6-NEXT: v_mov_b32_e32 v13, s7
+; GFX6-NEXT: s_lshr_b32 s6, s5, 9
; GFX6-NEXT: v_mov_b32_e32 v14, s8
; GFX6-NEXT: v_mov_b32_e32 v15, s9
-; GFX6-NEXT: s_lshr_b32 s8, s4, 6
-; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX6-NEXT: s_lshr_b32 s8, s5, 6
+; GFX6-NEXT: s_bfe_i64 s[34:35], s[44:45], 0x10000
; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256
+; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s40
-; GFX6-NEXT: v_mov_b32_e32 v1, s41
-; GFX6-NEXT: s_lshr_b32 s40, s4, 7
-; GFX6-NEXT: v_mov_b32_e32 v2, s44
-; GFX6-NEXT: v_mov_b32_e32 v3, s45
-; GFX6-NEXT: s_lshr_b32 s44, s4, 4
-; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v16, s34
+; GFX6-NEXT: v_mov_b32_e32 v17, s35
+; GFX6-NEXT: s_lshr_b32 s34, s5, 7
+; GFX6-NEXT: v_mov_b32_e32 v18, s40
+; GFX6-NEXT: v_mov_b32_e32 v19, s41
+; GFX6-NEXT: s_lshr_b32 s40, s5, 4
; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240
+; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v16, s36
-; GFX6-NEXT: v_mov_b32_e32 v17, s37
-; GFX6-NEXT: s_lshr_b32 s36, s4, 5
-; GFX6-NEXT: v_mov_b32_e32 v18, s42
-; GFX6-NEXT: v_mov_b32_e32 v19, s43
-; GFX6-NEXT: s_lshr_b32 s42, s4, 2
-; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224
+; GFX6-NEXT: v_mov_b32_e32 v0, s42
+; GFX6-NEXT: v_mov_b32_e32 v1, s43
+; GFX6-NEXT: s_lshr_b32 s42, s5, 5
+; GFX6-NEXT: v_mov_b32_e32 v2, s36
+; GFX6-NEXT: v_mov_b32_e32 v3, s37
+; GFX6-NEXT: s_lshr_b32 s36, s5, 2
+; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v8, s30
-; GFX6-NEXT: v_mov_b32_e32 v9, s31
-; GFX6-NEXT: s_lshr_b32 s30, s4, 3
-; GFX6-NEXT: s_lshr_b32 s4, s4, 1
-; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX6-NEXT: v_mov_b32_e32 v8, s38
+; GFX6-NEXT: v_mov_b32_e32 v9, s39
+; GFX6-NEXT: s_lshr_b32 s38, s5, 3
+; GFX6-NEXT: s_lshr_b32 s44, s5, 1
; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
@@ -8589,71 +8590,71 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
-; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
-; GFX6-NEXT: v_mov_b32_e32 v10, s6
-; GFX6-NEXT: v_mov_b32_e32 v11, s7
-; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
-; GFX6-NEXT: s_waitcnt expcnt(2)
-; GFX6-NEXT: v_mov_b32_e32 v0, s34
-; GFX6-NEXT: v_mov_b32_e32 v1, s35
+; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464
+; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:448
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432
+; GFX6-NEXT: v_mov_b32_e32 v10, s4
+; GFX6-NEXT: v_mov_b32_e32 v11, s5
+; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:416
+; GFX6-NEXT: s_waitcnt expcnt(1)
+; GFX6-NEXT: v_mov_b32_e32 v0, s30
+; GFX6-NEXT: v_mov_b32_e32 v1, s31
; GFX6-NEXT: v_mov_b32_e32 v2, s26
; GFX6-NEXT: v_mov_b32_e32 v3, s27
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s28
-; GFX6-NEXT: v_mov_b32_e32 v1, s29
-; GFX6-NEXT: v_mov_b32_e32 v2, s22
-; GFX6-NEXT: v_mov_b32_e32 v3, s23
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
+; GFX6-NEXT: v_mov_b32_e32 v0, s22
+; GFX6-NEXT: v_mov_b32_e32 v1, s23
+; GFX6-NEXT: v_mov_b32_e32 v2, s28
+; GFX6-NEXT: v_mov_b32_e32 v3, s29
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s24
; GFX6-NEXT: v_mov_b32_e32 v1, s25
; GFX6-NEXT: v_mov_b32_e32 v2, s18
; GFX6-NEXT: v_mov_b32_e32 v3, s19
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s20
; GFX6-NEXT: v_mov_b32_e32 v1, s21
; GFX6-NEXT: v_mov_b32_e32 v2, s14
; GFX6-NEXT: v_mov_b32_e32 v3, s15
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s16
; GFX6-NEXT: v_mov_b32_e32 v1, s17
-; GFX6-NEXT: v_mov_b32_e32 v2, s10
-; GFX6-NEXT: v_mov_b32_e32 v3, s11
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
+; GFX6-NEXT: v_mov_b32_e32 v3, s13
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s12
-; GFX6-NEXT: v_mov_b32_e32 v1, s13
-; GFX6-NEXT: v_mov_b32_e32 v2, s38
-; GFX6-NEXT: v_mov_b32_e32 v3, s39
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GFX6-NEXT: v_mov_b32_e32 v0, s10
+; GFX6-NEXT: v_mov_b32_e32 v1, s11
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
+; GFX6-NEXT: v_mov_b32_e32 v3, s7
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: v_mov_b32_e32 v1, s9
-; GFX6-NEXT: v_mov_b32_e32 v2, s40
-; GFX6-NEXT: v_mov_b32_e32 v3, s41
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GFX6-NEXT: v_mov_b32_e32 v2, s34
+; GFX6-NEXT: v_mov_b32_e32 v3, s35
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s44
-; GFX6-NEXT: v_mov_b32_e32 v1, s45
-; GFX6-NEXT: v_mov_b32_e32 v2, s36
-; GFX6-NEXT: v_mov_b32_e32 v3, s37
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GFX6-NEXT: v_mov_b32_e32 v0, s40
+; GFX6-NEXT: v_mov_b32_e32 v1, s41
+; GFX6-NEXT: v_mov_b32_e32 v2, s42
+; GFX6-NEXT: v_mov_b32_e32 v3, s43
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s42
-; GFX6-NEXT: v_mov_b32_e32 v1, s43
-; GFX6-NEXT: v_mov_b32_e32 v2, s30
-; GFX6-NEXT: v_mov_b32_e32 v3, s31
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GFX6-NEXT: v_mov_b32_e32 v8, s4
-; GFX6-NEXT: v_mov_b32_e32 v9, s5
-; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0
+; GFX6-NEXT: v_mov_b32_e32 v0, s36
+; GFX6-NEXT: v_mov_b32_e32 v1, s37
+; GFX6-NEXT: v_mov_b32_e32 v2, s38
+; GFX6-NEXT: v_mov_b32_e32 v3, s39
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272
+; GFX6-NEXT: v_mov_b32_e32 v8, s44
+; GFX6-NEXT: v_mov_b32_e32 v9, s45
+; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:256
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: constant_sextload_v64i1_to_v64i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 4491c4b766db9..39191a58ed979 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -1643,15 +1643,15 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s6, s4, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s5, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s6, s5, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s4, 16
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
@@ -1666,14 +1666,14 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT: s_ashr_i32 s0, s2, 16
-; GCN-HSA-NEXT: s_ashr_i32 s1, s3, 16
+; GCN-HSA-NEXT: s_ashr_i32 s0, s3, 16
+; GCN-HSA-NEXT: s_ashr_i32 s1, s2, 16
; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3
; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
@@ -6539,33 +6539,33 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s5
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s6, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s6, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s5
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x100000
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s5, 31
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s5, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s7, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s12, s7, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s12
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s13
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s16
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s17
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10
@@ -6586,8 +6586,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_mov_b32 s2, s7
-; GCN-HSA-NEXT: s_mov_b32 s8, s5
-; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16
+; GCN-HSA-NEXT: s_lshr_b32 s8, s6, 16
+; GCN-HSA-NEXT: s_mov_b32 s10, s5
; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16
; GCN-HSA-NEXT: s_ashr_i32 s13, s5, 16
; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x100000
@@ -6605,25 +6605,25 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s6
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s6
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -7161,12 +7161,12 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s5
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s3
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s1
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s4, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s2, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s6, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s5
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s4, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s3
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s2, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s1
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s0, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[0:1], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x100000
@@ -7174,60 +7174,60 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x100000
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s1, 31
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[18:19], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s3, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[16:17], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s7, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s7, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[24:25], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s3, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s3, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[20:21], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s5, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s5, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s7, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s7, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[24:25], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[22:23], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[20:21], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s40
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s20
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:80
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s25
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s24
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s30
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s31
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s1
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s23
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s21
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s18
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s19
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:64
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s17
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13
@@ -7243,19 +7243,19 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT: s_mov_b32 s12, s7
+; GCN-HSA-NEXT: s_mov_b32 s10, s7
+; GCN-HSA-NEXT: s_lshr_b32 s12, s6, 16
; GCN-HSA-NEXT: s_mov_b32 s14, s5
-; GCN-HSA-NEXT: s_mov_b32 s16, s3
-; GCN-HSA-NEXT: s_mov_b32 s18, s1
-; GCN-HSA-NEXT: s_ashr_i32 s27, s1, 31
+; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 16
+; GCN-HSA-NEXT: s_ashr_i32 s25, s1, 31
; GCN-HSA-NEXT: s_ashr_i32 s29, s3, 31
; GCN-HSA-NEXT: s_ashr_i32 s30, s3, 16
-; GCN-HSA-NEXT: s_lshr_b32 s20, s6, 16
-; GCN-HSA-NEXT: s_lshr_b32 s22, s4, 16
-; GCN-HSA-NEXT: s_lshr_b32 s24, s2, 16
-; GCN-HSA-NEXT: s_lshr_b32 s26, s0, 16
-; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[2:3], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000
+; GCN-HSA-NEXT: s_mov_b32 s18, s3
+; GCN-HSA-NEXT: s_lshr_b32 s20, s2, 16
+; GCN-HSA-NEXT: s_mov_b32 s22, s1
+; GCN-HSA-NEXT: s_lshr_b32 s24, s0, 16
+; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[2:3], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[10:11], 0x100000
; GCN-HSA-NEXT: s_ashr_i32 s28, s1, 16
; GCN-HSA-NEXT: s_ashr_i32 s31, s5, 31
; GCN-HSA-NEXT: s_ashr_i32 s33, s5, 16
@@ -7266,55 +7266,36 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[24:25], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[22:23], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
-; GCN-HSA-NEXT: s_add_u32 s24, s8, 0x70
-; GCN-HSA-NEXT: s_addc_u32 s25, s9, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14
-; GCN-HSA-NEXT: s_add_u32 s14, s8, 0x50
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15
-; GCN-HSA-NEXT: s_addc_u32 s15, s9, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15
+; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
+; GCN-HSA-NEXT: s_add_u32 s22, s8, 0x70
+; GCN-HSA-NEXT: s_addc_u32 s23, s9, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x60
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s34
-; GCN-HSA-NEXT: s_add_u32 s14, s8, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s33
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s31
+; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x50
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s12
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: s_addc_u32 s15, s9, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15
-; GCN-HSA-NEXT: s_add_u32 s14, s8, 16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: s_addc_u32 s15, s9, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: s_nop 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
-; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x60
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s31
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_nop 0
@@ -7323,17 +7304,35 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_add_u32 s4, s8, 48
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: s_add_u32 s4, s8, 32
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_add_u32 s4, s8, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
@@ -8307,148 +8306,151 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15
; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s11
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s9
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s7
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s1, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s1, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s3, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s3, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s5, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s5, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s7, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s9, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s9, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s11, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s11, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s9
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s1, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s3, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s5, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s7, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s7, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s9, 31
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[24:25], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[20:21], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[18:19], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s13, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s13, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s15, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s15, 16
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s54, s5
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s3
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s56, s1
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s14, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s12, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s10, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s8, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s6, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s4, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s2, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s0, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[22:23], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[20:21], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[18:19], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s9, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s11, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s11, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s13, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s13, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s15, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s15, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s14, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s12, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s10, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s8, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s7
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s6, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s5
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s4, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s3
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s2, 16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s0, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[14:15], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[14:15], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s52
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s53
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s50
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s51
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s48
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s49
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s46
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s47
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s54
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s55
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s52
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s53
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s50
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s51
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s48
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s49
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s58
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s57
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s56
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[22:23], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[54:55], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[56:57], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s15
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s61
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s60
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s59
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s58
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s57
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s55
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s37
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s35
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s31
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s29
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s16
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s17
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s27
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s23
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[42:43], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[40:41], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[38:39], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s45
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s43
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[46:47], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[44:45], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[42:43], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s47
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s45
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s43
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s39
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s46
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s47
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s37
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s35
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s33
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s31
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s29
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s27
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s40
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s41
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s25
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s13
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s10
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s11
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s9
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[36:37], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[34:35], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[30:31], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[28:29], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s7
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s4
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s5
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s21
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s31
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s36
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s37
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s34
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s35
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s28
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s26
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s27
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s26
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s17
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s22
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s23
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s14
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s15
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s17
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s9
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64:
@@ -8460,47 +8462,47 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT: s_mov_b32 s34, s15
-; GCN-HSA-NEXT: s_ashr_i32 s41, s3, 31
-; GCN-HSA-NEXT: s_ashr_i32 s42, s3, 16
-; GCN-HSA-NEXT: s_ashr_i32 s57, s5, 16
-; GCN-HSA-NEXT: s_ashr_i32 s59, s7, 31
-; GCN-HSA-NEXT: s_ashr_i32 s61, s7, 16
-; GCN-HSA-NEXT: s_ashr_i32 s63, s9, 31
-; GCN-HSA-NEXT: s_ashr_i32 s65, s9, 16
-; GCN-HSA-NEXT: s_ashr_i32 s67, s11, 31
-; GCN-HSA-NEXT: s_ashr_i32 s69, s11, 16
-; GCN-HSA-NEXT: s_mov_b32 s44, s13
-; GCN-HSA-NEXT: s_mov_b32 s46, s11
-; GCN-HSA-NEXT: s_mov_b32 s48, s9
-; GCN-HSA-NEXT: s_mov_b32 s50, s7
-; GCN-HSA-NEXT: s_mov_b32 s52, s5
-; GCN-HSA-NEXT: s_mov_b32 s38, s3
-; GCN-HSA-NEXT: s_mov_b32 s36, s1
-; GCN-HSA-NEXT: s_lshr_b32 s54, s14, 16
-; GCN-HSA-NEXT: s_lshr_b32 s56, s12, 16
-; GCN-HSA-NEXT: s_lshr_b32 s58, s10, 16
-; GCN-HSA-NEXT: s_lshr_b32 s60, s8, 16
-; GCN-HSA-NEXT: s_lshr_b32 s62, s6, 16
-; GCN-HSA-NEXT: s_lshr_b32 s64, s4, 16
-; GCN-HSA-NEXT: s_lshr_b32 s66, s2, 16
+; GCN-HSA-NEXT: s_mov_b32 s24, s15
+; GCN-HSA-NEXT: s_ashr_i32 s37, s3, 31
+; GCN-HSA-NEXT: s_ashr_i32 s38, s3, 16
+; GCN-HSA-NEXT: s_ashr_i32 s57, s11, 16
+; GCN-HSA-NEXT: s_ashr_i32 s59, s13, 31
+; GCN-HSA-NEXT: s_ashr_i32 s61, s13, 16
+; GCN-HSA-NEXT: s_ashr_i32 s63, s15, 31
+; GCN-HSA-NEXT: s_ashr_i32 s65, s15, 16
+; GCN-HSA-NEXT: s_lshr_b32 s46, s14, 16
+; GCN-HSA-NEXT: s_mov_b32 s48, s13
+; GCN-HSA-NEXT: s_lshr_b32 s50, s12, 16
+; GCN-HSA-NEXT: s_mov_b32 s52, s11
+; GCN-HSA-NEXT: s_lshr_b32 s34, s10, 16
+; GCN-HSA-NEXT: s_mov_b32 s30, s9
+; GCN-HSA-NEXT: s_lshr_b32 s28, s8, 16
+; GCN-HSA-NEXT: s_mov_b32 s54, s7
+; GCN-HSA-NEXT: s_lshr_b32 s56, s6, 16
+; GCN-HSA-NEXT: s_mov_b32 s58, s5
+; GCN-HSA-NEXT: s_lshr_b32 s60, s4, 16
+; GCN-HSA-NEXT: s_mov_b32 s62, s3
+; GCN-HSA-NEXT: s_lshr_b32 s64, s2, 16
+; GCN-HSA-NEXT: s_mov_b32 s66, s1
; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16
; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[34:35], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x100000
; GCN-HSA-NEXT: s_ashr_i32 s33, s1, 31
-; GCN-HSA-NEXT: s_ashr_i32 s40, s1, 16
-; GCN-HSA-NEXT: s_ashr_i32 s43, s5, 31
-; GCN-HSA-NEXT: s_ashr_i32 s70, s13, 31
-; GCN-HSA-NEXT: s_ashr_i32 s71, s13, 16
-; GCN-HSA-NEXT: s_ashr_i32 s72, s15, 31
-; GCN-HSA-NEXT: s_ashr_i32 s73, s15, 16
+; GCN-HSA-NEXT: s_ashr_i32 s36, s1, 16
+; GCN-HSA-NEXT: s_ashr_i32 s39, s5, 31
+; GCN-HSA-NEXT: s_ashr_i32 s40, s5, 16
+; GCN-HSA-NEXT: s_ashr_i32 s41, s7, 31
+; GCN-HSA-NEXT: s_ashr_i32 s42, s7, 16
+; GCN-HSA-NEXT: s_ashr_i32 s43, s9, 31
+; GCN-HSA-NEXT: s_ashr_i32 s44, s9, 16
+; GCN-HSA-NEXT: s_ashr_i32 s45, s11, 31
; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[4:5], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[10:11], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[12:13], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[14:15], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[70:71], s[10:11], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[72:73], s[12:13], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[74:75], s[14:15], 0x100000
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x100000
@@ -8510,149 +8512,149 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[54:55], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[54:55], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000
+; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000
-; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000
; GCN-HSA-NEXT: s_add_u32 s54, s16, 0xf0
; GCN-HSA-NEXT: s_addc_u32 s55, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s44
-; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xd0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s45
-; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s44
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s45
-; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xb0
-; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s44
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s45
-; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x90
-; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s44
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46
+; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xe0
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47
+; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s46
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s47
+; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xd0
+; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s46
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s47
+; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xc0
+; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s54
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s45
-; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70
+; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47
+; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xb0
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s55
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s73
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s72
-; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s65
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s63
+; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s71
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s38
-; GCN-HSA-NEXT: s_add_u32 s38, s16, 0x50
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s70
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39
-; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s47
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s69
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s67
-; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[8:11]
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36
-; GCN-HSA-NEXT: s_add_u32 s36, s16, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s37
-; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s36
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s37
-; GCN-HSA-NEXT: s_add_u32 s36, s16, 16
-; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s30
-; GCN-HSA-NEXT: s_add_u32 s30, s16, 0xe0
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s48
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s49
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s65
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s63
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s31
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s74
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30
+; GCN-HSA-NEXT: s_add_u32 s30, s16, 0xa0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s75
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31
; GCN-HSA-NEXT: s_addc_u32 s31, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, s44
-; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[12:15]
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s50
+; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s48
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26
+; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x90
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27
+; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27
+; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x80
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s49
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s61
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s59
+; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s72
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24
+; GCN-HSA-NEXT: s_add_u32 s24, s16, 0x70
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s73
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s50
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s51
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25
+; GCN-HSA-NEXT: s_addc_u32 s25, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v30, s46
+; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s52
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14
-; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xc0
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s51
-; GCN-HSA-NEXT: v_mov_b32_e32 v30, s45
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, s61
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s38
+; GCN-HSA-NEXT: s_add_u32 s14, s16, 0x60
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s53
+; GCN-HSA-NEXT: v_mov_b32_e32 v31, s47
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s57
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15
; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s52
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s53
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s57
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s43
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s39
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s36
-; GCN-HSA-NEXT: flat_store_dwordx4 v[29:30], v[16:19]
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s37
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s30
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s70
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s71
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s34
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s35
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s44
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s43
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26
+; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s24
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s15
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s40
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s28
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s29
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23]
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s28
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s34
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s35
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s29
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s31
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s25
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s14
; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12
-; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xa0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12
+; GCN-HSA-NEXT: s_add_u32 s12, s16, 0x50
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13
; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_nop 0
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
-; GCN-HSA-NEXT: s_add_u32 s10, s16, 0x80
+; GCN-HSA-NEXT: s_add_u32 s10, s16, 64
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_nop 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8
-; GCN-HSA-NEXT: s_add_u32 s8, s16, 0x60
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
+; GCN-HSA-NEXT: s_add_u32 s8, s16, 48
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9
; GCN-HSA-NEXT: s_addc_u32 s9, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_nop 0
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6
-; GCN-HSA-NEXT: s_add_u32 s6, s16, 64
+; GCN-HSA-NEXT: s_add_u32 s6, s16, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7
; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_nop 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4
-; GCN-HSA-NEXT: s_add_u32 s4, s16, 32
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index b39b38a420233..bc1998240ba32 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -6398,41 +6398,41 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s5, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s5, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s10, s5
-; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s4, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s4, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s4, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s4, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s4, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s5, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s16, s5
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[4:5], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s5, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s5, 31
; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s5, 24
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s10
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s11
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s16
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s17
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s8
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s9
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s11
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s12
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s13
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s4
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GFX6-NOHSA-NEXT: s_endpgm
;
; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64:
@@ -6445,11 +6445,11 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s4, s3, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s6, s3, 8
-; GFX7-HSA-NEXT: s_mov_b32 s8, s3
-; GFX7-HSA-NEXT: s_lshr_b32 s10, s2, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s12, s2, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s14, s2, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s6, s2, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s8, s2, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s10, s2, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s12, s3, 8
+; GFX7-HSA-NEXT: s_mov_b32 s14, s3
; GFX7-HSA-NEXT: s_ashr_i32 s5, s3, 31
; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
@@ -6465,32 +6465,32 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32
+; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
-; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11
+; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_endpgm
@@ -6502,11 +6502,11 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s3, 8
-; GFX8-NOHSA-NEXT: s_mov_b32 s8, s3
-; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s2, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s3, 8
+; GFX8-NOHSA-NEXT: s_mov_b32 s14, s3
; GFX8-NOHSA-NEXT: s_ashr_i32 s5, s3, 31
; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
@@ -6522,32 +6522,32 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -6615,34 +6615,34 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshr_b32 s4, s3, 16
-; GFX12-NEXT: s_lshr_b32 s6, s3, 8
-; GFX12-NEXT: s_mov_b32 s8, s3
-; GFX12-NEXT: s_lshr_b32 s10, s2, 16
-; GFX12-NEXT: s_lshr_b32 s12, s2, 24
+; GFX12-NEXT: s_lshr_b32 s6, s2, 16
+; GFX12-NEXT: s_lshr_b32 s8, s2, 24
+; GFX12-NEXT: s_lshr_b32 s10, s2, 8
+; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX12-NEXT: s_lshr_b32 s12, s3, 8
+; GFX12-NEXT: s_mov_b32 s14, s3
; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000
; GFX12-NEXT: s_ashr_i32 s15, s3, 31
; GFX12-NEXT: s_ashr_i32 s18, s3, 24
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GFX12-NEXT: s_lshr_b32 s14, s2, 8
-; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s15
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s7
+; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s9
+; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v7, s11
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[14:15], 0x80000
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v5, s17
; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s9
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[14:15], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s7
-; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s11
-; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13
-; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v7, s3
-; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v13, s3
+; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v15, s13
+; GFX12-NEXT: v_mov_b32_e32 v14, s12
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:32
; GFX12-NEXT: s_endpgm
%load = load <8 x i8>, ptr addrspace(4) %in
%ext = sext <8 x i8> %load to <8 x i64>
@@ -7033,80 +7033,81 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s7, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s14, s7
-; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 8
-; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s5, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s5, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s26, s5
-; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s4, 8
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[4:5], 0x80000
+; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s7, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s26, s7
+; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s5, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s5, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s8, s5
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s5, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s5, 24
; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s5, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s5, 24
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s7, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s7, 31
; GFX6-NOHSA-NEXT: s_ashr_i32 s38, s7, 24
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s38
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s14
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s15
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s36
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s37
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s33
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s26
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s27
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s11
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s34
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s35
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s33
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s26
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s27
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s13
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s14
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s15
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s12
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s13
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s31
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s18
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s28
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s22
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s23
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s11
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s7
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GFX6-NOHSA-NEXT: s_endpgm
;
; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64:
@@ -7118,31 +7119,33 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_lshr_b32 s8, s7, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 8
-; GFX7-HSA-NEXT: s_mov_b32 s12, s7
-; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s20, s5, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 8
-; GFX7-HSA-NEXT: s_ashr_i32 s29, s5, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s31, s5, 24
-; GFX7-HSA-NEXT: s_mov_b32 s24, s5
-; GFX7-HSA-NEXT: s_lshr_b32 s26, s4, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s28, s4, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s30, s4, 8
-; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[8:9], 0x80000
+; GFX7-HSA-NEXT: s_lshr_b32 s2, s6, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s8, s6, 24
+; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s14, s4, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s16, s4, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s18, s7, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8
+; GFX7-HSA-NEXT: s_ashr_i32 s27, s5, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s29, s5, 24
+; GFX7-HSA-NEXT: s_mov_b32 s22, s7
+; GFX7-HSA-NEXT: s_lshr_b32 s24, s5, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s26, s5, 8
+; GFX7-HSA-NEXT: s_mov_b32 s28, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x80000
; GFX7-HSA-NEXT: s_ashr_i32 s33, s7, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s36, s7, 24
-; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GFX7-HSA-NEXT: s_ashr_i32 s34, s7, 24
+; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3
+; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[24:25], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
@@ -7150,73 +7153,70 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX7-HSA-NEXT: s_add_u32 s26, s0, 0x70
-; GFX7-HSA-NEXT: s_addc_u32 s27, s1, 0
+; GFX7-HSA-NEXT: s_add_u32 s24, s0, 0x50
+; GFX7-HSA-NEXT: s_addc_u32 s25, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s25
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10
-; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x60
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11
-; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s11
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s36
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33
-; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x50
; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
-; GFX7-HSA-NEXT: s_add_u32 s10, s0, 64
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
-; GFX7-HSA-NEXT: s_add_u32 s10, s0, 48
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s29
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
-; GFX7-HSA-NEXT: s_add_u32 s10, s0, 32
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_endpgm
@@ -7225,107 +7225,109 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA: ; %bb.0:
; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s7, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s7, 8
-; GFX8-NOHSA-NEXT: s_mov_b32 s22, s7
-; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s6, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s6, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s6, 8
-; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s5, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s5, 8
-; GFX8-NOHSA-NEXT: s_mov_b32 s14, s5
-; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s4, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s4, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s4, 8
-; GFX8-NOHSA-NEXT: s_ashr_i32 s19, s5, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s31, s5, 24
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s10, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s10, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s10, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s8, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s8, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s8, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s11, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s11, 8
+; GFX8-NOHSA-NEXT: s_mov_b32 s28, s11
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s9, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s9, 8
+; GFX8-NOHSA-NEXT: s_mov_b32 s2, s9
+; GFX8-NOHSA-NEXT: s_ashr_i32 s25, s9, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s29, s9, 24
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[8:9], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[28:29], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GFX8-NOHSA-NEXT: s_ashr_i32 s28, s11, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s11, 24
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[24:25], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GFX8-NOHSA-NEXT: s_ashr_i32 s30, s7, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s7, 24
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[18:19], 0x80000
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x70
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s33
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s30
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x60
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
+; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 0x50
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
+; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
+; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 64
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s35
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
+; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 16
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x50
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s23
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11
+; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s33
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 0x60
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s35
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s29
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s29
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_nop 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s19
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -7435,64 +7437,64 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s8, s7, 16
-; GFX12-NEXT: s_lshr_b32 s10, s7, 8
-; GFX12-NEXT: s_mov_b32 s12, s7
-; GFX12-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
-; GFX12-NEXT: s_ashr_i32 s33, s7, 31
-; GFX12-NEXT: s_ashr_i32 s36, s7, 24
+; GFX12-NEXT: s_lshr_b32 s2, s6, 16
+; GFX12-NEXT: s_lshr_b32 s8, s6, 24
+; GFX12-NEXT: s_lshr_b32 s10, s6, 8
+; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000
; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX12-NEXT: s_lshr_b32 s14, s6, 16
-; GFX12-NEXT: s_lshr_b32 s16, s6, 24
-; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX12-NEXT: s_lshr_b32 s12, s4, 16
+; GFX12-NEXT: s_lshr_b32 s14, s4, 24
+; GFX12-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v3, s33
-; GFX12-NEXT: s_lshr_b32 s18, s6, 8
-; GFX12-NEXT: v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v5, s35
-; GFX12-NEXT: v_dual_mov_b32 v4, s34 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v9, s13
-; GFX12-NEXT: s_lshr_b32 s20, s5, 16
-; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35
+; GFX12-NEXT: s_lshr_b32 s16, s4, 8
+; GFX12-NEXT: v_dual_mov_b32 v4, s30 :: v_dual_mov_b32 v9, s3
+; GFX12-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s9
+; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT: s_lshr_b32 s18, s7, 16
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v11, s11
-; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s15
-; GFX12-NEXT: s_lshr_b32 s22, s5, 8
-; GFX12-NEXT: s_mov_b32 s24, s5
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v5, s31
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v13, s13
+; GFX12-NEXT: s_lshr_b32 s20, s7, 8
+; GFX12-NEXT: s_mov_b32 s22, s7
+; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GFX12-NEXT: s_lshr_b32 s24, s5, 16
+; GFX12-NEXT: s_ashr_i32 s33, s7, 31
+; GFX12-NEXT: s_ashr_i32 s36, s7, 24
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GFX12-NEXT: s_lshr_b32 s26, s4, 16
-; GFX12-NEXT: s_lshr_b32 s28, s4, 24
-; GFX12-NEXT: s_ashr_i32 s29, s5, 31
-; GFX12-NEXT: s_ashr_i32 s31, s5, 24
+; GFX12-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15
+; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v7, s17
+; GFX12-NEXT: s_lshr_b32 s26, s5, 8
+; GFX12-NEXT: s_mov_b32 s28, s5
+; GFX12-NEXT: s_ashr_i32 s27, s5, 31
+; GFX12-NEXT: s_ashr_i32 s29, s5, 24
+; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s17
-; GFX12-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v7, s19
-; GFX12-NEXT: s_lshr_b32 s30, s4, 8
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GFX12-NEXT: v_mov_b32_e32 v6, s18
-; GFX12-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64
+; GFX12-NEXT: v_dual_mov_b32 v0, s18 :: v_dual_mov_b32 v3, s33
+; GFX12-NEXT: v_dual_mov_b32 v1, s19 :: v_dual_mov_b32 v2, s36
+; GFX12-NEXT: v_mov_b32_e32 v9, s23
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[26:27], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v11, s21
+; GFX12-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v17, s25
+; GFX12-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v19, s27
+; GFX12-NEXT: v_dual_mov_b32 v18, s29 :: v_dual_mov_b32 v21, s5
+; GFX12-NEXT: v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v23, s7
+; GFX12-NEXT: v_mov_b32_e32 v22, s6
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1]
; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:96
-; GFX12-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s29
-; GFX12-NEXT: v_dual_mov_b32 v1, s21 :: v_dual_mov_b32 v2, s31
-; GFX12-NEXT: v_mov_b32_e32 v9, s25
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v11, s23
-; GFX12-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v17, s27
-; GFX12-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v19, s7
-; GFX12-NEXT: v_dual_mov_b32 v18, s6 :: v_dual_mov_b32 v21, s3
-; GFX12-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v23, s5
-; GFX12-NEXT: v_mov_b32_e32 v22, s4
-; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:32
; GFX12-NEXT: s_endpgm
%load = load <16 x i8>, ptr addrspace(4) %in
%ext = sext <16 x i8> %load to <16 x i64>
@@ -8204,157 +8206,157 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s7, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s7, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s30, s7
; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s6, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s6, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s6, 8
-; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s5, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s5, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s34, s5
-; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s2, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s2, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s0, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s0, 24
+; GFX6-NOHSA-NEXT: s_mov_b32 s34, s7
; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s1, 31
; GFX6-NOHSA-NEXT: s_ashr_i32 s13, s1, 24
-; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s3, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s3, 24
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[34:35], 0x80000
-; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s5, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s5, 24
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[30:31], 0x80000
-; GFX6-NOHSA-NEXT: s_ashr_i32 s25, s7, 31
-; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s7, 24
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[14:15], 0x80000
-; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s3, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s3, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s44, s3
-; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s2, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s2, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s2, 8
-; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s1, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s1, 8
-; GFX6-NOHSA-NEXT: s_mov_b32 s52, s1
-; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s0, 16
-; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s0, 24
-; GFX6-NOHSA-NEXT: s_lshr_b32 s54, s0, 8
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[0:1], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[4:5], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x80000
+; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s3, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s3, 24
+; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s5, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s49, s5, 24
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[34:35], 0x80000
+; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s7, 31
+; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s7, 24
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[30:31], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[28:29], 0x80000
+; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s0, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s7, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 8
+; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s5, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s5, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s46, s5
+; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s3, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s3, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s50, s3
+; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s1, 16
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[0:1], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[4:5], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[6:7], 0x80000
+; GFX6-NOHSA-NEXT: s_lshr_b32 s4, s1, 8
+; GFX6-NOHSA-NEXT: s_mov_b32 s6, s1
; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8
; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s58
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s59
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s56
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s57
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s62
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s63
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s60
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s61
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s27
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s23
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s21
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s19
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s17
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s60
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s61
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s58
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s59
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s2
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s3
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s56
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s57
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s38
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s39
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s54
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s55
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s52
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s53
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s21
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s19
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000
-; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s6
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:240
; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s8
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:208
; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s22
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s23
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s7
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s25
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s8
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s9
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:208
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[54:55], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[50:51], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[48:49], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s9
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s22
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s23
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:144
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[50:51], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[46:47], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[44:45], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[42:43], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[40:41], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[40:41], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x80000
-; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[10:11], 0x80000
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[16:17], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[14:15], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[12:13], 0x80000
+; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[10:11], 0x80000
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s44
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s45
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s42
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s43
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s40
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s41
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s17
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:144
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s46
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s47
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s44
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s45
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s13
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s11
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s20
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s21
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s52
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s53
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s30
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s31
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:112
-; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1)
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s14
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s15
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s34
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s49
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s42
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s43
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s40
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s41
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s18
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s19
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s24
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s25
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s28
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s17
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s15
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s30
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s31
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:240
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s34
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s35
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s13
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s11
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s38
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s39
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s28
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s29
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s21
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s38
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s39
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:160
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s26
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s27
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s25
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s22
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s23
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s9
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s7
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s8
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s9
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s7
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GFX6-NOHSA-NEXT: s_endpgm
;
; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64:
@@ -8366,211 +8368,212 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s40, s7, 8
-; GFX7-HSA-NEXT: s_mov_b32 s42, s7
-; GFX7-HSA-NEXT: s_lshr_b32 s44, s6, 16
-; GFX7-HSA-NEXT: s_ashr_i32 s41, s1, 24
-; GFX7-HSA-NEXT: s_ashr_i32 s43, s3, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s45, s3, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s48, s6, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s50, s6, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s52, s5, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s46, s5, 8
-; GFX7-HSA-NEXT: s_mov_b32 s54, s5
-; GFX7-HSA-NEXT: s_lshr_b32 s38, s4, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s36, s4, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s34, s4, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s28, s3, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s24, s3, 8
-; GFX7-HSA-NEXT: s_mov_b32 s26, s3
-; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s20, s2, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s18, s2, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s14, s1, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s56, s1, 8
-; GFX7-HSA-NEXT: s_mov_b32 s12, s1
-; GFX7-HSA-NEXT: s_lshr_b32 s58, s0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s60, s0, 24
-; GFX7-HSA-NEXT: s_lshr_b32 s62, s0, 8
-; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[10:11], 0x80000
+; GFX7-HSA-NEXT: s_lshr_b32 s12, s6, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 24
; GFX7-HSA-NEXT: s_ashr_i32 s33, s1, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s66, s5, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s67, s5, 24
-; GFX7-HSA-NEXT: s_ashr_i32 s68, s7, 31
-; GFX7-HSA-NEXT: s_ashr_i32 s69, s7, 24
-; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[64:65], s[6:7], 0x80000
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[60:61], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[58:59], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[56:57], 0x80000
+; GFX7-HSA-NEXT: s_ashr_i32 s37, s1, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s34, s0, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s30, s0, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s28, s0, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s66, s1, 8
+; GFX7-HSA-NEXT: s_mov_b32 s68, s1
+; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[12:13], 0x80000
+; GFX7-HSA-NEXT: s_lshr_b32 s36, s6, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 16
+; GFX7-HSA-NEXT: s_ashr_i32 s41, s3, 31
+; GFX7-HSA-NEXT: s_lshr_b32 s50, s4, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s52, s4, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s54, s2, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s56, s2, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s42, s2, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s26, s7, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8
+; GFX7-HSA-NEXT: s_mov_b32 s24, s7
+; GFX7-HSA-NEXT: s_lshr_b32 s18, s5, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s14, s5, 8
+; GFX7-HSA-NEXT: s_mov_b32 s16, s5
+; GFX7-HSA-NEXT: s_lshr_b32 s58, s3, 16
+; GFX7-HSA-NEXT: s_lshr_b32 s60, s3, 8
+; GFX7-HSA-NEXT: s_mov_b32 s62, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x80000
+; GFX7-HSA-NEXT: s_ashr_i32 s44, s3, 24
+; GFX7-HSA-NEXT: s_ashr_i32 s45, s5, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s46, s5, 24
+; GFX7-HSA-NEXT: s_ashr_i32 s47, s7, 31
+; GFX7-HSA-NEXT: s_ashr_i32 s48, s7, 24
+; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[2:3], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[70:71], s[4:5], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[72:73], s[6:7], 0x80000
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[66:67], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[60:61], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[44:45], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[42:43], 0x80000
-; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[40:41], 0x80000
-; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xf0
+; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[40:41], 0x80000
+; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[36:37], 0x80000
+; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xd0
; GFX7-HSA-NEXT: s_addc_u32 s63, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s58
-; GFX7-HSA-NEXT: s_add_u32 s58, s8, 0xe0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s59
-; GFX7-HSA-NEXT: s_addc_u32 s59, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s48
-; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xd0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s49
-; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s49
-; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xc0
-; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s62
-; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s49
-; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xb0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s63
-; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s69
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s68
-; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s58
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s38
-; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0xa0
+; GFX7-HSA-NEXT: s_add_u32 s60, s8, 0xc0
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s61
-; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s59
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39
+; GFX7-HSA-NEXT: s_addc_u32 s61, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s50
+; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x90
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51
+; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51
+; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x80
+; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62
+; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s38
+; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s63
+; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s39
; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s60
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s72
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34
+; GFX7-HSA-NEXT: s_add_u32 s34, s8, 64
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s73
+; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s61
+; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35
+; GFX7-HSA-NEXT: s_addc_u32 s35, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s58
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s59
+; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s54
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s55
+; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s56
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s57
+; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s39
; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s56
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s30
-; GFX7-HSA-NEXT: s_add_u32 s30, s8, 0x90
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s31
-; GFX7-HSA-NEXT: s_addc_u32 s31, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s30
-; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s31
-; GFX7-HSA-NEXT: s_add_u32 s30, s8, 0x80
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s57
-; GFX7-HSA-NEXT: s_addc_u32 s31, s9, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s64
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28
-; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x70
-; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s65
-; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s50
-; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s51
-; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s29
-; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s52
-; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s53
-; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s49
-; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s67
-; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s66
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s38
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s54
-; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s24
-; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x60
-; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s55
-; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s46
-; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s47
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s39
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s36
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37
-; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s30
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22
+; GFX7-HSA-NEXT: s_add_u32 s22, s8, 16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23
+; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s25
-; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s28
-; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s31
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s34
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s35
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s26
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s45
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s43
-; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s27
-; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s29
-; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s25
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x50
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21
-; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16
-; GFX7-HSA-NEXT: s_add_u32 s16, s8, 64
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17
-; GFX7-HSA-NEXT: s_addc_u32 s17, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17
+; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s34
+; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s22
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s23
+; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xf0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31
+; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s22
+; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s42
+; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s43
+; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s35
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s23
+; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xe0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s70
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s71
+; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s52
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s53
+; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s51
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s28
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s29
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
+; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s22
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s48
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s47
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s25
+; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s23
+; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s20
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s21
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
+; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xb0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
+; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s46
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s45
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
-; GFX7-HSA-NEXT: s_add_u32 s14, s8, 48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14
+; GFX7-HSA-NEXT: s_add_u32 s14, s8, 0xa0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15
; GFX7-HSA-NEXT: s_addc_u32 s15, s9, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s41
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10
-; GFX7-HSA-NEXT: s_add_u32 s10, s8, 32
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11
-; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12
+; GFX7-HSA-NEXT: s_add_u32 s12, s8, 0x70
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
+; GFX7-HSA-NEXT: s_addc_u32 s13, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s44
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-HSA-NEXT: s_add_u32 s6, s8, 0x60
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX7-HSA-NEXT: s_addc_u32 s7, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: s_nop 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-HSA-NEXT: s_add_u32 s4, s8, 48
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s37
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9
+; GFX7-HSA-NEXT: s_nop 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_endpgm
;
@@ -8580,140 +8583,175 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s50, s7, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s52, s7, 8
-; GFX8-NOHSA-NEXT: s_mov_b32 s54, s7
-; GFX8-NOHSA-NEXT: s_lshr_b32 s56, s6, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s58, s6, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s60, s6, 8
-; GFX8-NOHSA-NEXT: s_lshr_b32 s62, s5, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s5, 8
-; GFX8-NOHSA-NEXT: s_mov_b32 s46, s5
-; GFX8-NOHSA-NEXT: s_lshr_b32 s42, s4, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s4, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s38, s4, 8
-; GFX8-NOHSA-NEXT: s_lshr_b32 s64, s3, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s3, 8
-; GFX8-NOHSA-NEXT: s_mov_b32 s30, s3
-; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s2, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s2, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s2, 8
-; GFX8-NOHSA-NEXT: s_lshr_b32 s66, s1, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s1, 8
-; GFX8-NOHSA-NEXT: s_mov_b32 s18, s1
-; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s0, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s0, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s0, 8
-; GFX8-NOHSA-NEXT: s_ashr_i32 s65, s3, 24
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[2:3], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[44:45], s[6:7], 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s46, s6, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s6, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s50, s6, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s52, s4, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s54, s4, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s56, s4, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s58, s2, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s60, s2, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s2, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s36, s0, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s0, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s0, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s62, s7, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s7, 8
+; GFX8-NOHSA-NEXT: s_mov_b32 s22, s7
+; GFX8-NOHSA-NEXT: s_lshr_b32 s64, s5, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s5, 8
+; GFX8-NOHSA-NEXT: s_mov_b32 s18, s5
+; GFX8-NOHSA-NEXT: s_lshr_b32 s66, s3, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s3, 8
+; GFX8-NOHSA-NEXT: s_mov_b32 s14, s3
+; GFX8-NOHSA-NEXT: s_lshr_b32 s44, s1, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s1, 8
+; GFX8-NOHSA-NEXT: s_mov_b32 s10, s1
+; GFX8-NOHSA-NEXT: s_ashr_i32 s63, s5, 24
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[0:1], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[38:39], s[4:5], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[68:69], s[6:7], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s1, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s42, s1, 24
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[44:45], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GFX8-NOHSA-NEXT: s_ashr_i32 s43, s3, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s44, s3, 24
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[66:67], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GFX8-NOHSA-NEXT: s_ashr_i32 s4, s1, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s1, 24
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[66:67], 0x80000
+; GFX8-NOHSA-NEXT: s_ashr_i32 s45, s5, 31
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX8-NOHSA-NEXT: s_ashr_i32 s64, s7, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s65, s7, 24
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[62:63], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s3, 31
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[64:65], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
-; GFX8-NOHSA-NEXT: s_ashr_i32 s64, s5, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s5, s5, 24
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
-; GFX8-NOHSA-NEXT: s_ashr_i32 s66, s7, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s7, s7, 24
; GFX8-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s50
-; GFX8-NOHSA-NEXT: s_add_u32 s50, s8, 0xf0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s51
-; GFX8-NOHSA-NEXT: s_addc_u32 s51, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s50
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s66
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s51
-; GFX8-NOHSA-NEXT: s_add_u32 s50, s8, 0xe0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s51, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s50
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s54
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s55
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s52
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s53
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s51
-; GFX8-NOHSA-NEXT: s_add_u32 s50, s8, 0xd0
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s46
+; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xd0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47
+; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47
+; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xc0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s68
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s69
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s50
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47
+; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0x90
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s51, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s50
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s56
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s57
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s58
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s59
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s51
+; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s52
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s53
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s54
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s55
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s44
-; GFX8-NOHSA-NEXT: s_add_u32 s44, s8, 0xc0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s45
-; GFX8-NOHSA-NEXT: s_addc_u32 s45, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s44
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s38
+; GFX8-NOHSA-NEXT: s_add_u32 s38, s8, 0x80
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s39
+; GFX8-NOHSA-NEXT: s_addc_u32 s39, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s38
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s56
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s39
+; GFX8-NOHSA-NEXT: s_add_u32 s38, s8, 0x50
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_addc_u32 s39, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s38
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s58
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s59
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s60
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s61
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s45
-; GFX8-NOHSA-NEXT: s_add_u32 s44, s8, 0xb0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s45, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s44
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s62
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s63
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s64
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s45
-; GFX8-NOHSA-NEXT: s_add_u32 s44, s8, 0xa0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s45, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s44
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s46
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s45
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s39
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30
+; GFX8-NOHSA-NEXT: s_add_u32 s30, s8, 64
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31
+; GFX8-NOHSA-NEXT: s_addc_u32 s31, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s40
-; GFX8-NOHSA-NEXT: s_add_u32 s40, s8, 0x90
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41
-; GFX8-NOHSA-NEXT: s_addc_u32 s41, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s40
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s42
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s43
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s41
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s31
+; GFX8-NOHSA-NEXT: s_add_u32 s30, s8, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s31, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s36
-; GFX8-NOHSA-NEXT: s_add_u32 s36, s8, 0x80
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s37
-; GFX8-NOHSA-NEXT: s_addc_u32 s37, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s36
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s38
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s37
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s34
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s31
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s27
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s29
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xf0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s65
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s64
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xe0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xb0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s63
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xa0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
@@ -8723,33 +8761,15 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x60
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s65
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s33
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x50
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s34
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 64
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s28
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s44
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -8760,32 +8780,16 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s42
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s33
; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -8984,122 +8988,120 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s40, s7, 16
-; GFX12-NEXT: s_lshr_b32 s50, s6, 8
-; GFX12-NEXT: s_lshr_b32 s62, s3, 16
-; GFX12-NEXT: s_ashr_i32 s51, s3, 24
-; GFX12-NEXT: s_lshr_b32 s42, s7, 8
-; GFX12-NEXT: s_mov_b32 s44, s7
-; GFX12-NEXT: s_lshr_b32 s46, s6, 16
-; GFX12-NEXT: s_lshr_b32 s48, s6, 24
-; GFX12-NEXT: s_lshr_b32 s38, s5, 16
-; GFX12-NEXT: s_lshr_b32 s52, s5, 8
-; GFX12-NEXT: s_mov_b32 s54, s5
-; GFX12-NEXT: s_lshr_b32 s56, s4, 16
-; GFX12-NEXT: s_lshr_b32 s58, s4, 24
-; GFX12-NEXT: s_lshr_b32 s60, s4, 8
-; GFX12-NEXT: s_lshr_b32 s36, s3, 8
-; GFX12-NEXT: s_mov_b32 s34, s3
-; GFX12-NEXT: s_lshr_b32 s28, s2, 16
-; GFX12-NEXT: s_lshr_b32 s26, s2, 24
-; GFX12-NEXT: s_lshr_b32 s24, s2, 8
-; GFX12-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000
+; GFX12-NEXT: s_lshr_b32 s34, s6, 16
+; GFX12-NEXT: s_lshr_b32 s36, s6, 24
+; GFX12-NEXT: s_lshr_b32 s38, s6, 8
+; GFX12-NEXT: s_lshr_b32 s40, s4, 16
+; GFX12-NEXT: s_lshr_b32 s42, s4, 24
+; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
+; GFX12-NEXT: s_lshr_b32 s44, s4, 8
; GFX12-NEXT: s_bfe_i64 s[66:67], s[6:7], 0x80000
-; GFX12-NEXT: s_ashr_i32 s39, s3, 31
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000
-; GFX12-NEXT: s_ashr_i32 s62, s5, 31
-; GFX12-NEXT: s_ashr_i32 s63, s5, 24
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000
-; GFX12-NEXT: s_ashr_i32 s50, s7, 31
+; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35
+; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GFX12-NEXT: s_ashr_i32 s7, s7, 24
+; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s37
+; GFX12-NEXT: v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v5, s67
+; GFX12-NEXT: s_lshr_b32 s28, s2, 16
+; GFX12-NEXT: s_lshr_b32 s46, s2, 24
+; GFX12-NEXT: s_bfe_i64 s[64:65], s[4:5], 0x80000
; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s41
-; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v4, s66 :: v_dual_mov_b32 v7, s39
+; GFX12-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v9, s41
+; GFX12-NEXT: s_lshr_b32 s48, s2, 8
+; GFX12-NEXT: v_dual_mov_b32 v8, s40 :: v_dual_mov_b32 v11, s43
+; GFX12-NEXT: v_dual_mov_b32 v10, s42 :: v_dual_mov_b32 v13, s65
+; GFX12-NEXT: s_lshr_b32 s50, s0, 16
+; GFX12-NEXT: s_lshr_b32 s52, s0, 24
; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v3, s50
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s45
-; GFX12-NEXT: v_dual_mov_b32 v4, s44 :: v_dual_mov_b32 v7, s43
-; GFX12-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v9, s47
-; GFX12-NEXT: v_dual_mov_b32 v8, s46 :: v_dual_mov_b32 v11, s49
-; GFX12-NEXT: v_dual_mov_b32 v10, s48 :: v_dual_mov_b32 v13, s67
-; GFX12-NEXT: v_dual_mov_b32 v12, s66 :: v_dual_mov_b32 v15, s5
-; GFX12-NEXT: v_mov_b32_e32 v14, s4
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[38:39], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v12, s64 :: v_dual_mov_b32 v15, s45
+; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GFX12-NEXT: v_mov_b32_e32 v14, s44
+; GFX12-NEXT: s_lshr_b32 s54, s0, 8
+; GFX12-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
+; GFX12-NEXT: s_lshr_b32 s56, s7, 16
+; GFX12-NEXT: s_lshr_b32 s58, s5, 16
+; GFX12-NEXT: s_lshr_b32 s60, s1, 8
+; GFX12-NEXT: s_mov_b32 s62, s1
+; GFX12-NEXT: s_ashr_i32 s57, s1, 24
+; GFX12-NEXT: s_ashr_i32 s59, s3, 31
+; GFX12-NEXT: s_ashr_i32 s61, s3, 24
+; GFX12-NEXT: s_ashr_i32 s63, s5, 31
; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:240
-; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:224
-; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:208
-; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:192
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s62
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s63
-; GFX12-NEXT: v_mov_b32_e32 v5, s55
-; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v4, s54 :: v_dual_mov_b32 v7, s53
-; GFX12-NEXT: v_dual_mov_b32 v6, s52 :: v_dual_mov_b32 v9, s57
-; GFX12-NEXT: v_dual_mov_b32 v8, s56 :: v_dual_mov_b32 v11, s59
-; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s31
-; GFX12-NEXT: s_lshr_b32 s22, s1, 16
-; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v12, s30 :: v_dual_mov_b32 v15, s61
-; GFX12-NEXT: v_dual_mov_b32 v14, s60 :: v_dual_mov_b32 v17, s3
-; GFX12-NEXT: s_lshr_b32 s16, s1, 8
-; GFX12-NEXT: s_mov_b32 s18, s1
-; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s39
-; GFX12-NEXT: v_dual_mov_b32 v18, s51 :: v_dual_mov_b32 v21, s35
-; GFX12-NEXT: s_lshr_b32 s14, s0, 16
-; GFX12-NEXT: s_lshr_b32 s12, s0, 24
-; GFX12-NEXT: s_ashr_i32 s6, s1, 31
-; GFX12-NEXT: s_ashr_i32 s33, s1, 24
-; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v20, s34 :: v_dual_mov_b32 v23, s37
-; GFX12-NEXT: v_mov_b32_e32 v22, s36
-; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:176
-; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:160
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:208
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:192
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:144
; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:128
-; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:112
-; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:96
-; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s27
-; GFX12-NEXT: v_dual_mov_b32 v1, s29 :: v_dual_mov_b32 v2, s26
-; GFX12-NEXT: v_mov_b32_e32 v5, s21
-; GFX12-NEXT: s_lshr_b32 s64, s0, 8
+; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s47
+; GFX12-NEXT: v_dual_mov_b32 v1, s29 :: v_dual_mov_b32 v2, s46
+; GFX12-NEXT: v_mov_b32_e32 v5, s31
+; GFX12-NEXT: s_lshr_b32 s26, s7, 8
+; GFX12-NEXT: s_mov_b32 s24, s7
+; GFX12-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v4, s30 :: v_dual_mov_b32 v7, s49
+; GFX12-NEXT: v_dual_mov_b32 v6, s48 :: v_dual_mov_b32 v9, s51
+; GFX12-NEXT: s_lshr_b32 s18, s5, 8
+; GFX12-NEXT: s_mov_b32 s20, s5
+; GFX12-NEXT: s_lshr_b32 s16, s3, 16
+; GFX12-NEXT: s_lshr_b32 s12, s3, 8
+; GFX12-NEXT: s_mov_b32 s14, s3
+; GFX12-NEXT: s_lshr_b32 s10, s1, 16
+; GFX12-NEXT: s_ashr_i32 s33, s1, 31
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[60:61], 0x80000
+; GFX12-NEXT: s_ashr_i32 s60, s5, 24
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[58:59], 0x80000
+; GFX12-NEXT: s_ashr_i32 s58, s7, 31
+; GFX12-NEXT: s_ashr_i32 s62, s7, 24
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v8, s50 :: v_dual_mov_b32 v11, s53
+; GFX12-NEXT: v_dual_mov_b32 v10, s52 :: v_dual_mov_b32 v13, s23
+; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v15, s55
+; GFX12-NEXT: v_dual_mov_b32 v14, s54 :: v_dual_mov_b32 v17, s7
+; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v19, s58
+; GFX12-NEXT: v_dual_mov_b32 v18, s62 :: v_dual_mov_b32 v21, s25
; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s25
-; GFX12-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v9, s23
-; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v11, s6
-; GFX12-NEXT: v_dual_mov_b32 v10, s33 :: v_dual_mov_b32 v13, s19
-; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[0:1], s[64:65], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v12, s18 :: v_dual_mov_b32 v15, s17
-; GFX12-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v17, s15
-; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v19, s13
-; GFX12-NEXT: v_dual_mov_b32 v18, s12 :: v_dual_mov_b32 v21, s11
-; GFX12-NEXT: v_dual_mov_b32 v20, s10 :: v_dual_mov_b32 v23, s1
-; GFX12-NEXT: v_mov_b32_e32 v22, s0
+; GFX12-NEXT: v_dual_mov_b32 v20, s24 :: v_dual_mov_b32 v23, s27
+; GFX12-NEXT: v_mov_b32_e32 v22, s26
; GFX12-NEXT: s_clause 0x5
; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:80
; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:64
-; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:48
-; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:32
-; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:16
-; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9]
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:16
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9]
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:240
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:224
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s63
+; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s60
+; GFX12-NEXT: v_mov_b32_e32 v5, s21
+; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s19
+; GFX12-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s17
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v11, s59
+; GFX12-NEXT: v_dual_mov_b32 v10, s61 :: v_dual_mov_b32 v13, s15
+; GFX12-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s13
+; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v17, s11
+; GFX12-NEXT: v_dual_mov_b32 v16, s10 :: v_dual_mov_b32 v19, s33
+; GFX12-NEXT: v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v21, s3
+; GFX12-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v23, s1
+; GFX12-NEXT: v_mov_b32_e32 v22, s0
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:176
+; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:160
+; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:112
+; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:96
+; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:48
+; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:32
; GFX12-NEXT: s_endpgm
%load = load <32 x i8>, ptr addrspace(4) %in
%ext = sext <32 x i8> %load to <32 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 3753737d251e4..771ea8c8a6ec1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -1738,8 +1738,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v4
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v5
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v4
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v4, 0, 16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -1758,8 +1758,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4
; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
@@ -6365,8 +6365,8 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v7, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v6, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v6, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
@@ -6390,28 +6390,28 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
-; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v8, v3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1
; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v14, v5, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3
@@ -6420,9 +6420,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
@@ -6964,58 +6964,59 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v3
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v1
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v1
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v1, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v3
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v3
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v2, 0, 16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v7
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v5
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v5
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v5, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v7
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v7
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v6, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v12, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v4, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v18, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v0, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v18, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v17, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v9, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v7
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v7
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v5
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v1
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v5
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v5, 0, 16
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v6, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v1, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v1
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v1, 0, 16
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v2, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v1, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64:
@@ -7037,31 +7038,31 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
-; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16
@@ -7069,36 +7070,36 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v5
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7
-; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6
; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v4
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v9, v16, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v9, v18, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7
-; GCN-HSA-NEXT: v_bfe_i32 v6, v17, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v6, v19, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[7:10]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10]
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
; GCN-HSA-NEXT: v_mov_b32_e32 v15, v3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v0
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v0
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1
; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v2, v16, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v2, v18, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v8, v15, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3
@@ -7107,9 +7108,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15]
; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
@@ -8100,113 +8101,115 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3)
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v15
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v3
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v3
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v11
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v15
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v22, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v3
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v3
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v7
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v23, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v15
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v15
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:240
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v1
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v1
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v1, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v12
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v13
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v13
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v13, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v11
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v27, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v1
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v1
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v1, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v22, 0, 16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v26, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v7
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v7
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v5
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v5
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v5, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v11
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v11
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v27, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v12, 0, 16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v9
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v9
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v9, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v14, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v21, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v20, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v8, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v15
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v15
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v21, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v13
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v13
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v13, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v12, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v14, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v20, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v2, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v9
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v9, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v10, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v8, 0, 16
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:192
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v10, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v19, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v18, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v17, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v16, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v4, 0, 16
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v6, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v2, 0, 16
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v0, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v27, v2, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v4, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v6, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v19, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v18, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v17, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v16, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v0, 0, 16
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v2, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v0, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v25
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 31, v28
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v28, 31, v27
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v26
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 31, v30
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64:
@@ -8218,180 +8221,179 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
+; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s4, s2, 48
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
-; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v5
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v5
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7
-; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19]
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3
-; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v7
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v7
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
-; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v1
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v1
+; GCN-HSA-NEXT: v_bfe_i32 v16, v13, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v13
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19]
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
-; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v3
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19]
; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3
-; GCN-HSA-NEXT: s_waitcnt vmcnt(5)
-; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v9
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, v11
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0
+; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v11
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18
; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19]
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, v15
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
+; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v15
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[13:16]
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v12
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0
+; GCN-HSA-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
+; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_waitcnt vmcnt(6)
-; GCN-HSA-NEXT: v_bfe_i32 v16, v13, 0, 16
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v13
+; GCN-HSA-NEXT: v_bfe_i32 v12, v9, 0, 16
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v10
+; GCN-HSA-NEXT: v_mov_b32_e32 v29, v11
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v8
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v9
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; GCN-HSA-NEXT: v_bfe_i32 v16, v29, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v14, v18, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v8, v10, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v10, v28, 0, 16
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v11
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GCN-HSA-NEXT: s_waitcnt vmcnt(5)
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v2
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19]
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v15
-; GCN-HSA-NEXT: v_bfe_i32 v18, v1, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, v15
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
+; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v10, v20, 0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, v3
+; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT: v_bfe_i32 v8, v21, 0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v0
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v4
+; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v19, v1, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v15
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[8:11]
+; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v5
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: v_bfe_i32 v24, v0, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v26, v26, 0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GCN-HSA-NEXT: v_bfe_i32 v14, v27, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v26
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27]
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[20:23]
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, v7
+; GCN-HSA-NEXT: v_bfe_i32 v18, v18, 0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[19:22]
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; GCN-HSA-NEXT: v_bfe_i32 v0, v19, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v10
-; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[16:19]
-; GCN-HSA-NEXT: v_bfe_i32 v23, v2, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v19, v10, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v2, v25, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v25, v9, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
-; GCN-HSA-NEXT: v_bfe_i32 v13, v4, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v15, v3, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v25
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14
-; GCN-HSA-NEXT: v_bfe_i32 v11, v14, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
-; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[23:26]
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3
-; GCN-HSA-NEXT: flat_store_dwordx4 v[3:4], v[13:16]
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-HSA-NEXT: v_bfe_i32 v13, v1, 0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19]
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v2, 16, v7
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v8
-; GCN-HSA-NEXT: v_bfe_i32 v21, v21, 0, 16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GCN-HSA-NEXT: v_bfe_i32 v17, v22, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v21
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18]
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v12
-; GCN-HSA-NEXT: v_bfe_i32 v5, v12, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
-; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index 5bc02c4d63181..1ccc019668b0d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -6274,12 +6274,12 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 8
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s4, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s4, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s5, 8
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 8
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x80000
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s4, 31
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s4, 24
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000
@@ -6294,19 +6294,19 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s9
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s13
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s10
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s11
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s5
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v8i8_to_v8i64:
@@ -6325,11 +6325,12 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v1
; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v0
; GCN-HSA-NEXT: s_lshr_b32 s4, s2, 16
-; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 8
-; GCN-HSA-NEXT: s_lshr_b32 s8, s3, 16
-; GCN-HSA-NEXT: s_lshr_b32 s10, s3, 24
-; GCN-HSA-NEXT: s_lshr_b32 s12, s3, 8
+; GCN-HSA-NEXT: s_lshr_b32 s6, s3, 16
+; GCN-HSA-NEXT: s_lshr_b32 s8, s3, 24
+; GCN-HSA-NEXT: s_lshr_b32 s10, s3, 8
+; GCN-HSA-NEXT: s_lshr_b32 s12, s2, 8
; GCN-HSA-NEXT: s_ashr_i32 s13, s2, 31
+; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8
; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x80000
; GCN-HSA-NEXT: s_ashr_i32 s16, s2, 24
; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x80000
@@ -6337,38 +6338,37 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: s_add_u32 s4, s0, 48
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
+; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7]
-; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s15
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s10
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s11
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v8i8_to_v8i64:
@@ -6388,10 +6388,10 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s4, 8
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s5, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 8
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s15, s4, 31
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s18, s4, 24
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
@@ -6408,18 +6408,18 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s11
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s12
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s5
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v8i8_to_v8i64:
@@ -6934,84 +6934,85 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s9, v1
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v0
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 8
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s5
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s4, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s4, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s9, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s9, 8
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s9
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s8, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s8, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s8, 8
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s4, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s6, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 8
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s5
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s7, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s7, 8
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s7, 24
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s9, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s9, 24
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s5, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 24
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s15
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s36
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s37
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s33
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s31
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s26
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s27
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s33
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s26
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s14
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s15
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s31
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s19
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s21
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s24
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s25
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s9
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s22
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s10
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s11
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s7
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v16i8_to_v16i64:
@@ -7024,41 +7025,41 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3
; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2
-; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1
+; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3
; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0
-; GCN-HSA-NEXT: s_lshr_b32 s6, s3, 16
-; GCN-HSA-NEXT: s_lshr_b32 s8, s3, 8
-; GCN-HSA-NEXT: s_mov_b32 s10, s3
-; GCN-HSA-NEXT: s_lshr_b32 s12, s2, 16
-; GCN-HSA-NEXT: s_lshr_b32 s14, s2, 24
-; GCN-HSA-NEXT: s_lshr_b32 s16, s2, 8
+; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1
+; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16
+; GCN-HSA-NEXT: s_lshr_b32 s8, s2, 24
+; GCN-HSA-NEXT: s_lshr_b32 s10, s2, 8
+; GCN-HSA-NEXT: s_lshr_b32 s18, s3, 16
+; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 8
+; GCN-HSA-NEXT: s_mov_b32 s22, s3
; GCN-HSA-NEXT: s_ashr_i32 s7, s3, 31
; GCN-HSA-NEXT: s_ashr_i32 s9, s3, 24
; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x80000
-; GCN-HSA-NEXT: s_lshr_b32 s18, s5, 16
-; GCN-HSA-NEXT: s_lshr_b32 s20, s5, 8
-; GCN-HSA-NEXT: s_mov_b32 s22, s5
+; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16
+; GCN-HSA-NEXT: s_lshr_b32 s14, s4, 24
+; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_lshr_b32 s2, s4, 16
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000
+; GCN-HSA-NEXT: s_ashr_i32 s4, s5, 24
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT: s_lshr_b32 s2, s5, 16
; GCN-HSA-NEXT: s_ashr_i32 s3, s5, 31
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
-; GCN-HSA-NEXT: s_ashr_i32 s7, s5, 24
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24
-; GCN-HSA-NEXT: s_lshr_b32 s24, s4, 24
-; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 8
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25
-; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4
+; GCN-HSA-NEXT: s_lshr_b32 s4, s5, 8
+; GCN-HSA-NEXT: s_mov_b32 s24, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
@@ -7069,66 +7070,66 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6
-; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x70
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7
-; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6
-; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x60
-; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[2:5]
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6
; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[2:5]
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
; GCN-HSA-NEXT: s_add_u32 s6, s0, 64
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[2:5]
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s15
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s18
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15
+; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s18
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7]
-; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s20
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s21
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v16i8_to_v16i64:
@@ -7142,83 +7143,84 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s5, 16
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v1
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v0
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s5, 8
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, s5
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s4, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s4, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s4, 8
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s35, s5, 31
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s5, 24
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s4, 24
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s4, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s6, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s6, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s6, 8
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s9, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s9, 8
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, s9
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s8, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s8, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s8, 8
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s5, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 8
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s5
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s7, 8
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s30, s7
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s9, 31
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s7, 31
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s7, 24
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s38
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s9, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s15
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s5, 31
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s5, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s36
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s37
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s20
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s21
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s18
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s19
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s26
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s27
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s33
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s31
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s28
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s29
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s12
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s14
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s15
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s13
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s5
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s35
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s20
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s21
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s22
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s23
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s24
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s25
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s38
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s33
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s31
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s29
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s26
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s27
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s9
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s5
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v16i8_to_v16i64:
@@ -8174,166 +8176,166 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s39, v3
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s38, v2
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s37, v1
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s36, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v2
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s19, v3
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s22, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s23, v1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v7
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v6
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s11, v5
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v4
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s39, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s39, 8
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s39
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s38, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s38, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s38, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s37, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s37, 8
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s37
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s36, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s36, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s36, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s7, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s7, 8
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s7
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s39, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s39, 24
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[38:39], 0x80000
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s24, v6
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s25, v7
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v4
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s13, v5
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s18, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s18, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s18, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s22, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s22, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s22, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s24, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s24, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s24, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s12, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s12, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s12, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s19, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[18:19], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[22:23], 0x80000
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s6, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[24:25], 0x80000
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s44
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s45
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s6, 24
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s9
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s37, 31
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s5
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s37, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s19, 8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s42
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s43
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s40
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s23, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s40
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s41
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s6, 8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s36
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s37
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s11, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s38
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s39
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s23, 8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s23
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[12:13], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s30
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s31
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s11, 8
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s25, 16
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s11
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s25, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s25, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s23, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s23, 24
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s19, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s19, 24
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s25, 24
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s25
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s28
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s29
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s26
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s27
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s10, 24
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s10, 8
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s13, 16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s26
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s27
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 8
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s7, 31
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s7, 24
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s9
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s11, 31
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s5
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s11, 24
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[10:11], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[6:7], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[22:23], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[24:25], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x80000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s13, 31
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s13, 24
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s13
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[12:13], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[22:23], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[28:29], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[44:45], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[36:37], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[40:41], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[42:43], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[38:39], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[40:41], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s48
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s49
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s46
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s47
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s9
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s41
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s39
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s9
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s10
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s11
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s13
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s15
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s14
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s15
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s41
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s42
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s43
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s44
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s33
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s17
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s34
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s35
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s18
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s19
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s30
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s31
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s21
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s19
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s46
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s45
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s36
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s37
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s30
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s31
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s28
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s29
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s27
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s24
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s25
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s6
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v32i8_to_v32i64:
@@ -8346,225 +8348,223 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
-; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
+; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1]
+; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v9
-; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v8
-; GCN-HSA-NEXT: v_readfirstlane_b32 s9, v7
-; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v6
-; GCN-HSA-NEXT: s_lshr_b32 s16, s7, 16
-; GCN-HSA-NEXT: s_lshr_b32 s18, s7, 8
-; GCN-HSA-NEXT: s_mov_b32 s24, s7
-; GCN-HSA-NEXT: s_lshr_b32 s22, s6, 16
-; GCN-HSA-NEXT: s_lshr_b32 s14, s6, 24
-; GCN-HSA-NEXT: s_lshr_b32 s2, s6, 8
-; GCN-HSA-NEXT: s_lshr_b32 s4, s9, 16
-; GCN-HSA-NEXT: s_lshr_b32 s10, s9, 8
-; GCN-HSA-NEXT: s_mov_b32 s12, s9
-; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[6:7], 0x80000
-; GCN-HSA-NEXT: s_lshr_b32 s6, s8, 16
-; GCN-HSA-NEXT: s_lshr_b32 s28, s8, 24
-; GCN-HSA-NEXT: s_lshr_b32 s30, s8, 8
-; GCN-HSA-NEXT: s_ashr_i32 s43, s9, 31
-; GCN-HSA-NEXT: s_ashr_i32 s52, s9, 24
-; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[8:9], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[16:17], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[24:25], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v6
+; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v4
+; GCN-HSA-NEXT: v_readfirstlane_b32 s9, v5
+; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v7
+; GCN-HSA-NEXT: s_lshr_b32 s20, s6, 16
+; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 24
+; GCN-HSA-NEXT: s_lshr_b32 s10, s8, 16
+; GCN-HSA-NEXT: s_lshr_b32 s2, s8, 24
+; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x80000
+; GCN-HSA-NEXT: s_lshr_b32 s16, s6, 8
+; GCN-HSA-NEXT: s_lshr_b32 s4, s8, 8
+; GCN-HSA-NEXT: s_lshr_b32 s12, s7, 16
+; GCN-HSA-NEXT: s_lshr_b32 s14, s7, 8
+; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000
+; GCN-HSA-NEXT: s_lshr_b32 s6, s9, 16
+; GCN-HSA-NEXT: s_mov_b32 s28, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27
+; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[18:19], 0x80000
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_readfirstlane_b32 s25, v5
-; GCN-HSA-NEXT: v_readfirstlane_b32 s24, v4
-; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GCN-HSA-NEXT: v_readfirstlane_b32 s37, v3
-; GCN-HSA-NEXT: v_readfirstlane_b32 s36, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s40, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s41, v3
+; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[10:11], 0x80000
+; GCN-HSA-NEXT: v_readfirstlane_b32 s44, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s45, v1
; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27
-; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[30:31], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[28:29], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[12:13], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[10:11], 0x80000
+; GCN-HSA-NEXT: s_mov_b32 s22, s7
+; GCN-HSA-NEXT: s_lshr_b32 s8, s9, 8
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25
+; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[16:17], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[14:15], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[12:13], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[4:5], 0x80000
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s22
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s20
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s26
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s27
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s42
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT: s_lshr_b32 s42, s44, 16
+; GCN-HSA-NEXT: s_lshr_b32 s48, s44, 24
+; GCN-HSA-NEXT: s_lshr_b32 s28, s44, 8
+; GCN-HSA-NEXT: s_lshr_b32 s6, s45, 16
+; GCN-HSA-NEXT: s_lshr_b32 s2, s45, 8
+; GCN-HSA-NEXT: s_mov_b32 s4, s45
+; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x80000
+; GCN-HSA-NEXT: s_lshr_b32 s44, s40, 16
+; GCN-HSA-NEXT: s_lshr_b32 s50, s40, 24
+; GCN-HSA-NEXT: s_lshr_b32 s52, s40, 8
+; GCN-HSA-NEXT: s_lshr_b32 s20, s41, 16
+; GCN-HSA-NEXT: s_lshr_b32 s12, s41, 8
+; GCN-HSA-NEXT: s_mov_b32 s14, s41
+; GCN-HSA-NEXT: s_ashr_i32 s33, s9, 31
+; GCN-HSA-NEXT: s_ashr_i32 s37, s7, 31
+; GCN-HSA-NEXT: s_ashr_i32 s38, s7, 24
+; GCN-HSA-NEXT: s_ashr_i32 s34, s9, 24
+; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s21
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
-; GCN-HSA-NEXT: s_lshr_b32 s14, s37, 16
-; GCN-HSA-NEXT: s_lshr_b32 s10, s37, 8
-; GCN-HSA-NEXT: s_mov_b32 s12, s37
-; GCN-HSA-NEXT: s_lshr_b32 s8, s36, 16
-; GCN-HSA-NEXT: s_lshr_b32 s6, s36, 24
-; GCN-HSA-NEXT: s_lshr_b32 s4, s36, 8
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[36:37], 0x80000
-; GCN-HSA-NEXT: s_lshr_b32 s38, s25, 16
-; GCN-HSA-NEXT: s_lshr_b32 s28, s25, 8
-; GCN-HSA-NEXT: s_mov_b32 s36, s25
-; GCN-HSA-NEXT: s_lshr_b32 s48, s24, 16
-; GCN-HSA-NEXT: s_lshr_b32 s22, s24, 24
-; GCN-HSA-NEXT: s_lshr_b32 s18, s24, 8
-; GCN-HSA-NEXT: s_ashr_i32 s50, s7, 31
-; GCN-HSA-NEXT: s_ashr_i32 s51, s7, 24
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s16
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s17
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s23
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s15
-; GCN-HSA-NEXT: s_ashr_i32 s33, s37, 31
-; GCN-HSA-NEXT: s_ashr_i32 s42, s37, 24
-; GCN-HSA-NEXT: s_ashr_i32 s53, s25, 31
-; GCN-HSA-NEXT: s_ashr_i32 s54, s25, 24
-; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[24:25], 0x80000
+; GCN-HSA-NEXT: s_ashr_i32 s30, s45, 31
+; GCN-HSA-NEXT: s_ashr_i32 s31, s45, 24
+; GCN-HSA-NEXT: s_ashr_i32 s35, s41, 31
+; GCN-HSA-NEXT: s_ashr_i32 s36, s41, 24
+; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[48:49], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
-; GCN-HSA-NEXT: s_add_u32 s48, s0, 0x70
-; GCN-HSA-NEXT: s_addc_u32 s49, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, s48
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s49
-; GCN-HSA-NEXT: s_add_u32 s48, s0, 0x60
-; GCN-HSA-NEXT: s_addc_u32 s49, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s48
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49
-; GCN-HSA-NEXT: s_add_u32 s48, s0, 0x50
-; GCN-HSA-NEXT: s_addc_u32 s49, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49
-; GCN-HSA-NEXT: s_add_u32 s48, s0, 64
-; GCN-HSA-NEXT: s_addc_u32 s49, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s44
-; GCN-HSA-NEXT: s_add_u32 s44, s0, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s45
-; GCN-HSA-NEXT: s_addc_u32 s45, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s48
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s50
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7]
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, s49
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s34
-; GCN-HSA-NEXT: s_add_u32 s34, s0, 32
-; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s35
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s44
-; GCN-HSA-NEXT: s_addc_u32 s35, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s46
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s47
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s45
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, s52
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s43
+; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
+; GCN-HSA-NEXT: s_add_u32 s54, s0, 0x50
+; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s54
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s55
+; GCN-HSA-NEXT: s_add_u32 s54, s0, 64
+; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s54
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s55
+; GCN-HSA-NEXT: s_add_u32 s54, s0, 16
+; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40
+; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xd0
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s41
+; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s54
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s40
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s41
+; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xc0
+; GCN-HSA-NEXT: v_mov_b32_e32 v29, s55
+; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[16:19]
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20
-; GCN-HSA-NEXT: s_add_u32 s20, s0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21
-; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s20
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s21
-; GCN-HSA-NEXT: s_add_u32 s20, s0, 0xf0
; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
-; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s34
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, s20
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s35
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s30
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s31
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s21
-; GCN-HSA-NEXT: s_add_u32 s20, s0, 0xe0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[20:23]
-; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s20
-; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s21
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT: s_add_u32 s20, s0, 0xd0
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s38
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s39
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s54
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s53
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s36
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s37
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s28
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s29
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
-; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s20
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s21
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: s_nop 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26
+; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x90
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27
+; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
+; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x80
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s44
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s24
+; GCN-HSA-NEXT: s_add_u32 s24, s0, 0x70
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s45
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s50
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s51
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s40
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s25
+; GCN-HSA-NEXT: s_addc_u32 s25, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s52
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s42
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s43
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23]
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s18
+; GCN-HSA-NEXT: s_add_u32 s18, s0, 0x60
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27
+; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s19
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24
+; GCN-HSA-NEXT: s_addc_u32 s19, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s38
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19
+; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16
-; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xc0
+; GCN-HSA-NEXT: s_add_u32 s16, s0, 48
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17
; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_nop 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8
+; GCN-HSA-NEXT: s_add_u32 s8, s0, 32
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9
+; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
+; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xf0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
+; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
-; GCN-HSA-NEXT: s_add_u32 s14, s0, 0xb0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15
-; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: s_nop 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
-; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xa0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
-; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_nop 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6
-; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x90
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
+; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xb0
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x80
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 0xa0
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
@@ -8584,155 +8584,155 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v3
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v2
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s9, 16
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v1
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s9, 8
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s9
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s8, 8
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s71, s9, 31
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s72, s9, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v0
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s11, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s13, v7
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s12, v6
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s15, v5
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s14, v4
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s11, 8
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s54, s11
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s69, s11, 31
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s11, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s40
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s41
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s38
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s39
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v7
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v5
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s4, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s4, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s9, 8
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s39, s11, 24
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s6, 8
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s10, 16
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s10, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s11, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s11, 8
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s66, s11
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[10:11], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[68:69], s[4:5], 0x80000
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s11, 31
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[38:39], 0x80000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s8, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s8, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s69
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s40
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s41
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s8, 8
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s10, 8
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x80000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s13, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s13, 8
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s5, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s5, 8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s36, s13
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s13, 31
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s13, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s12, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s60
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s12, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s12, 8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s5
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s71, s5, 31
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s72, s5, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s7, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s37
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s15, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s7, 8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, s7
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s7, 31
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s7, 24
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s15, 8
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s9, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s15
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s15, 31
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s15, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s14, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s14, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s14, 8
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[62:63], 0x80000
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s8, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s63
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s8, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[66:67], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[64:65], 0x80000
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[68:69], 0x80000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s42
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s9, 31
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s9, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s6, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s65
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s6, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x80000
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x80000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s43
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s44
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s45
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s44
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s39
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s45
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s46
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s47
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v32i8_to_v32i64:
@@ -10309,33 +10309,34 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i16(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, 0xffff
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s5, 16
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s5
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s4
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s9, s5, 0x80000
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s4
+; GCN-NOHSA-VI-NEXT: s_bfe_i32 s10, s4, 0x80000
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s5, 24
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s4, s4, 24
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 8
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 8
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s7, s7, 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s6, s6, 0x80000
; GCN-NOHSA-VI-NEXT: s_and_b32 s9, 0xffff, s9
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 8
+; GCN-NOHSA-VI-NEXT: s_and_b32 s10, 0xffff, s10
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 16
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff0000
; GCN-NOHSA-VI-NEXT: s_and_b32 s7, 0xffff, s7
; GCN-NOHSA-VI-NEXT: s_and_b32 s6, 0xffff, s6
-; GCN-NOHSA-VI-NEXT: v_and_b32_sdwa v0, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff0000
; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8
+; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s10, s11
; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s7, s5
; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s6, s4
-; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, s10, v0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
@@ -10777,27 +10778,27 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s4, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s5, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s6, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s7, 16
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s7
-; GCN-NOHSA-VI-NEXT: s_bfe_i32 s13, s7, 0x80000
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s7, 16
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s5
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s4
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s6
-; GCN-NOHSA-VI-NEXT: s_bfe_i32 s15, s6, 0x80000
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s6, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i32 s17, s5, 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i32 s19, s4, 0x80000
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s5, 24
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v2
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v3
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s6, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s7, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s4, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s5, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s5
+; GCN-NOHSA-VI-NEXT: s_bfe_i32 s13, s5, 0x80000
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s5, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s7
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s6
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s4
+; GCN-NOHSA-VI-NEXT: s_bfe_i32 s15, s4, 0x80000
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s4, s4, 24
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8
+; GCN-NOHSA-VI-NEXT: s_bfe_i32 s17, s7, 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i32 s19, s6, 0x80000
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s7, 24
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s6, 24
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s11, s11, 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s10, s10, 0x80000
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s16, s16, 8
@@ -10806,12 +10807,12 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s8, s8, 0x80000
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s12, s12, 8
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 8
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s17, 0xffff, s17
; GCN-NOHSA-VI-NEXT: s_and_b32 s19, 0xffff, s19
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 16
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff0000
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff0000
; GCN-NOHSA-VI-NEXT: s_and_b32 s11, 0xffff, s11
; GCN-NOHSA-VI-NEXT: s_and_b32 s10, 0xffff, s10
; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff0000
@@ -10822,22 +10823,22 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: s_and_b32 s15, 0xffff, s15
; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff0000
; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff0000
-; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s11, s7
-; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s10, s6
+; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s11, s5
+; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s10, s4
; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s17, s16
; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s19, s18
-; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s9, s5
-; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s8, s4
+; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s9, s7
+; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s8, s6
; GCN-NOHSA-VI-NEXT: s_or_b32 s12, s13, s12
; GCN-NOHSA-VI-NEXT: s_or_b32 s13, s15, s14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s4
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s12
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s5
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -11577,8 +11578,8 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
@@ -11586,38 +11587,39 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s6, 16
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s7, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s28, s7
+; GCN-NOHSA-VI-NEXT: s_bfe_i32 s29, s7, 0x80000
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s30, s6
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s31, s6, 0x80000
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s7, 16
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s6, 24
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s14, s14, 0x80000
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s28, s7
-; GCN-NOHSA-VI-NEXT: s_bfe_i32 s29, s7, 0x80000
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s7, 24
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s15, s15, 0x80000
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s14, 0xffff, s14
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v5
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 16
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v4
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v5
+; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff0000
; GCN-NOHSA-VI-NEXT: s_and_b32 s15, 0xffff, s15
; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s14, s6
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s5
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s20, s11
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s22, s10
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s20, s9
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s22, s8
; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s15, s7
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 8
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s15, s5, 0x80000
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
-; GCN-NOHSA-VI-NEXT: s_bfe_i32 s21, s11, 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i32 s23, s10, 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i32 s21, s9, 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i32 s23, s8, 0x80000
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s20, s20, 8
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s22, s22, 8
; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff0000
; GCN-NOHSA-VI-NEXT: s_and_b32 s15, 0xffff, s15
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v7
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v6
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v7
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s4, 16
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s5, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s21, 0xffff, s21
@@ -11627,11 +11629,11 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: s_or_b32 s14, s15, s14
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s4
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s5, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s8, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s9, 16
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s24, s9
-; GCN-NOHSA-VI-NEXT: s_bfe_i32 s25, s9, 0x80000
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s9, s9, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s10, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s11, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s24, s11
+; GCN-NOHSA-VI-NEXT: s_bfe_i32 s25, s11, 0x80000
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s11, s11, 16
; GCN-NOHSA-VI-NEXT: s_or_b32 s20, s21, s20
; GCN-NOHSA-VI-NEXT: s_or_b32 s21, s23, s22
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s15, s15, 8
@@ -11640,13 +11642,12 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s13, s13, 0x80000
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s4, s4, 24
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s12, s12, 0x80000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s10, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s11, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s11, s11, 16
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s26, s8
-; GCN-NOHSA-VI-NEXT: s_bfe_i32 s27, s8, 0x80000
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s8, s8, 24
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s8, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s9, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s26, s10
+; GCN-NOHSA-VI-NEXT: s_bfe_i32 s27, s10, 0x80000
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s10, s10, 24
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 8
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s17, s17, 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s16, s16, 0x80000
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s28, s28, 8
@@ -11657,16 +11658,16 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: s_and_b32 s13, 0xffff, s13
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s12, 0xffff, s12
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s10, s10, 24
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 8
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s9, s9, 24
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s8, s8, 24
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s19, s19, 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s18, s18, 0x80000
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s24, s24, 8
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s26, s26, 8
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s29, 0xffff, s29
; GCN-NOHSA-VI-NEXT: s_and_b32 s31, 0xffff, s31
-; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff0000
; GCN-NOHSA-VI-NEXT: s_and_b32 s17, 0xffff, s17
; GCN-NOHSA-VI-NEXT: s_and_b32 s16, 0xffff, s16
; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff0000
@@ -11674,45 +11675,45 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: s_or_b32 s15, s22, s15
; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s13, s5
; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s12, s4
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s25, 0xffff, s25
; GCN-NOHSA-VI-NEXT: s_and_b32 s27, 0xffff, s27
-; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff0000
; GCN-NOHSA-VI-NEXT: s_and_b32 s19, 0xffff, s19
; GCN-NOHSA-VI-NEXT: s_and_b32 s18, 0xffff, s18
; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s24, 0xffff0000
; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff0000
-; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s17, s9
-; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s16, s8
+; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s17, s11
+; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s16, s10
; GCN-NOHSA-VI-NEXT: s_or_b32 s16, s29, s28
; GCN-NOHSA-VI-NEXT: s_or_b32 s17, s31, s30
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s19, s11
-; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s18, s10
+; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s19, s9
+; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s18, s8
; GCN-NOHSA-VI-NEXT: s_or_b32 s18, s25, s24
; GCN-NOHSA-VI-NEXT: s_or_b32 s19, s27, s26
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_nop 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s17
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_nop 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: s_nop 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v32i8_to_v32i16:
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index 8dcecfe291177..8e985d09bb755 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -1075,12 +1075,12 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(ptr addrspace(3) %out,
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read_b64 v[0:1], v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v1
-; SI-NEXT: v_ashrrev_i32_e32 v4, 16, v0
-; SI-NEXT: v_bfe_i32 v1, v1, 0, 16
-; SI-NEXT: v_bfe_i32 v3, v0, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
+; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v1
+; SI-NEXT: v_bfe_i32 v2, v0, 0, 16
+; SI-NEXT: v_bfe_i32 v4, v1, 0, 16
; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset1:1
+; SI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
; SI-NEXT: s_endpgm
;
; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i32:
@@ -6145,11 +6145,11 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out,
; SI-NEXT: v_bfe_i32 v8, v1, 0, 16
; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
; SI-NEXT: v_bfe_i32 v10, v9, 0, 16
-; SI-NEXT: v_bfe_i32 v12, v12, 0, 16
-; SI-NEXT: v_bfe_i32 v14, v11, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT: v_bfe_i32 v12, v12, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: v_bfe_i32 v14, v11, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
@@ -6811,10 +6811,10 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out
; SI-NEXT: v_mov_b32_e32 v18, s0
; SI-NEXT: s_waitcnt lgkmcnt(1)
; SI-NEXT: v_mov_b32_e32 v12, v3
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v14, v7
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v14, v7
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4
; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v5
@@ -6837,24 +6837,24 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out
; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
; SI-NEXT: v_bfe_i32 v3, v5, 0, 16
; SI-NEXT: v_bfe_i32 v5, v6, 0, 16
-; SI-NEXT: v_bfe_i32 v7, v0, 0, 16
-; SI-NEXT: v_bfe_i32 v10, v2, 0, 16
+; SI-NEXT: v_bfe_i32 v10, v0, 0, 16
+; SI-NEXT: v_bfe_i32 v7, v2, 0, 16
; SI-NEXT: v_bfe_i32 v12, v19, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; SI-NEXT: v_bfe_i32 v14, v17, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; SI-NEXT: v_bfe_i32 v16, v16, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; SI-NEXT: ds_write2_b64 v18, v[3:4], v[8:9] offset0:2 offset1:3
; SI-NEXT: v_bfe_i32 v3, v15, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7
-; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; SI-NEXT: ds_write2_b64 v18, v[10:11], v[3:4] offset0:12 offset1:13
-; SI-NEXT: ds_write2_b64 v18, v[7:8], v[16:17] offset0:8 offset1:9
+; SI-NEXT: ds_write2_b64 v18, v[7:8], v[3:4] offset0:12 offset1:13
+; SI-NEXT: ds_write2_b64 v18, v[10:11], v[16:17] offset0:8 offset1:9
; SI-NEXT: ds_write2_b64 v18, v[5:6], v[14:15] offset0:4 offset1:5
; SI-NEXT: ds_write2_b64 v18, v[1:2], v[12:13] offset1:1
; SI-NEXT: s_endpgm
@@ -8106,16 +8106,16 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out
; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; SI-NEXT: ds_write2_b64 v7, v[10:11], v[12:13] offset0:4 offset1:5
; SI-NEXT: v_bfe_i32 v11, v6, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; SI-NEXT: v_bfe_i32 v13, v4, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; SI-NEXT: v_bfe_i32 v15, v15, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; SI-NEXT: v_bfe_i32 v16, v14, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; SI-NEXT: ds_write2_b64 v7, v[9:10], v[16:17] offset1:1
; SI-NEXT: v_bfe_i32 v17, v18, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 8fe68ba748971..5087bdb9d8f7b 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -533,8 +533,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1
; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
-; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1
-; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
+; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1
+; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
@@ -1912,8 +1913,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1
; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
-; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1
-; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
+; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1
+; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index 676359fcec462..5c0f813c8c829 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -391,144 +391,156 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa
define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-LABEL: sdiv_v2i32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s10, s6
-; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s10, s2
+; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s2
-; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: s_mov_b32 s8, s6
+; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
-; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
-; GCN-NEXT: v_xor_b32_e32 v4, v0, v2
-; GCN-NEXT: v_xor_b32_e32 v7, v1, v3
-; GCN-NEXT: v_max_i32_e32 v2, v2, v6
-; GCN-NEXT: v_max_i32_e32 v3, v3, v9
-; GCN-NEXT: v_cvt_f32_u32_e32 v6, v2
-; GCN-NEXT: v_cvt_f32_u32_e32 v9, v3
-; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0
-; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GCN-NEXT: v_max_i32_e32 v0, v0, v5
-; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v9
-; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2
-; GCN-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v3
-; GCN-NEXT: v_mul_lo_u32 v9, v9, v6
-; GCN-NEXT: v_mul_lo_u32 v10, v10, v5
-; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v1
-; GCN-NEXT: v_mul_hi_u32 v9, v6, v9
-; GCN-NEXT: v_max_i32_e32 v1, v1, v8
-; GCN-NEXT: v_mul_hi_u32 v8, v5, v10
-; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GCN-NEXT: v_mul_hi_u32 v6, v0, v6
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v5
-; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7
-; GCN-NEXT: v_mul_lo_u32 v8, v6, v2
-; GCN-NEXT: v_mul_lo_u32 v10, v5, v3
-; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v6
-; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
-; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10
-; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5
-; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
-; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
-; GCN-NEXT: v_sub_i32_e32 v8, vcc, v0, v2
-; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
-; GCN-NEXT: v_sub_i32_e32 v9, vcc, v1, v3
-; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3]
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1]
-; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v6
-; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3]
-; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v5
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v4
-; GCN-NEXT: v_xor_b32_e32 v1, v1, v7
-; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: v_readfirstlane_b32 s0, v2
+; GCN-NEXT: s_abs_i32 s1, s0
+; GCN-NEXT: v_cvt_f32_u32_e32 v2, s1
+; GCN-NEXT: s_sub_i32 s6, 0, s1
+; GCN-NEXT: v_readfirstlane_b32 s8, v3
+; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT: v_mul_lo_u32 v4, s6, v2
+; GCN-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NEXT: s_abs_i32 s7, s6
+; GCN-NEXT: s_xor_b32 s0, s6, s0
+; GCN-NEXT: v_mul_hi_u32 v4, v2, v4
+; GCN-NEXT: s_ashr_i32 s6, s0, 31
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v4
+; GCN-NEXT: v_mul_hi_u32 v0, s7, v0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_mul_i32 s0, s0, s1
+; GCN-NEXT: s_sub_i32 s0, s7, s0
+; GCN-NEXT: s_sub_i32 s7, s0, s1
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0
+; GCN-NEXT: s_cmp_ge_u32 s0, s1
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT: s_cselect_b32 s0, s7, s0
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0
+; GCN-NEXT: s_cmp_ge_u32 s0, s1
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_abs_i32 s7, s8
+; GCN-NEXT: v_cvt_f32_u32_e32 v3, s7
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_sub_i32 s4, 0, s7
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, s6, v0
+; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
+; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s4, v3
+; GCN-NEXT: v_readfirstlane_b32 s4, v1
+; GCN-NEXT: s_xor_b32 s5, s4, s8
+; GCN-NEXT: s_abs_i32 s4, s4
+; GCN-NEXT: v_mul_hi_u32 v1, v3, v4
+; GCN-NEXT: s_ashr_i32 s5, s5, 31
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; GCN-NEXT: v_mul_hi_u32 v1, s4, v1
+; GCN-NEXT: v_readfirstlane_b32 s6, v1
+; GCN-NEXT: s_mul_i32 s6, s6, s7
+; GCN-NEXT: s_sub_i32 s4, s4, s6
+; GCN-NEXT: s_sub_i32 s6, s4, s7
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1
+; GCN-NEXT: s_cmp_ge_u32 s4, s7
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: s_cselect_b32 s4, s6, s4
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1
+; GCN-NEXT: s_cmp_ge_u32 s4, s7
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: v_xor_b32_e32 v1, s5, v1
+; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s5, v1
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: sdiv_v2i32:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v2
-; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v3
-; TONGA-NEXT: v_xor_b32_e32 v4, v0, v2
-; TONGA-NEXT: v_xor_b32_e32 v7, v1, v3
-; TONGA-NEXT: v_max_i32_e32 v2, v2, v6
-; TONGA-NEXT: v_max_i32_e32 v3, v3, v9
-; TONGA-NEXT: v_cvt_f32_u32_e32 v6, v2
-; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v3
-; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; TONGA-NEXT: v_max_i32_e32 v0, v0, v5
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v9
-; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2
-; TONGA-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; TONGA-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; TONGA-NEXT: v_cvt_u32_f32_e32 v6, v6
-; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5
-; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v3
-; TONGA-NEXT: v_mul_lo_u32 v9, v9, v6
-; TONGA-NEXT: v_mul_lo_u32 v10, v10, v5
-; TONGA-NEXT: v_sub_u32_e32 v8, vcc, 0, v1
-; TONGA-NEXT: v_mul_hi_u32 v9, v6, v9
-; TONGA-NEXT: v_max_i32_e32 v1, v1, v8
-; TONGA-NEXT: v_mul_hi_u32 v8, v5, v10
-; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v9
-; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v8
-; TONGA-NEXT: v_mul_hi_u32 v6, v0, v6
-; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5
-; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v7
-; TONGA-NEXT: v_mul_lo_u32 v8, v6, v2
-; TONGA-NEXT: v_mul_lo_u32 v10, v5, v3
-; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v6
-; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
-; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v10
-; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5
-; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
-; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
-; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v0, v2
-; TONGA-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
-; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v1, v3
-; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3]
-; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1]
-; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v6
-; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3]
-; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v5
-; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; TONGA-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
-; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
-; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4
-; TONGA-NEXT: v_xor_b32_e32 v1, v1, v7
-; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
-; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v7
-; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; TONGA-NEXT: v_readfirstlane_b32 s0, v2
+; TONGA-NEXT: s_abs_i32 s1, s0
+; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s1
+; TONGA-NEXT: s_sub_i32 s6, 0, s1
+; TONGA-NEXT: v_readfirstlane_b32 s8, v3
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2
+; TONGA-NEXT: v_mul_lo_u32 v4, s6, v2
+; TONGA-NEXT: v_readfirstlane_b32 s6, v0
+; TONGA-NEXT: s_abs_i32 s7, s6
+; TONGA-NEXT: s_xor_b32 s0, s6, s0
+; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4
+; TONGA-NEXT: s_ashr_i32 s6, s0, 31
+; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v4
+; TONGA-NEXT: v_mul_hi_u32 v0, s7, v0
+; TONGA-NEXT: v_readfirstlane_b32 s0, v0
+; TONGA-NEXT: s_mul_i32 s0, s0, s1
+; TONGA-NEXT: s_sub_i32 s0, s7, s0
+; TONGA-NEXT: s_sub_i32 s7, s0, s1
+; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; TONGA-NEXT: s_cmp_ge_u32 s0, s1
+; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
+; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; TONGA-NEXT: s_cselect_b32 s0, s7, s0
+; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; TONGA-NEXT: s_cmp_ge_u32 s0, s1
+; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
+; TONGA-NEXT: s_abs_i32 s7, s8
+; TONGA-NEXT: v_cvt_f32_u32_e32 v3, s7
+; TONGA-NEXT: s_mov_b32 s0, s4
+; TONGA-NEXT: s_sub_i32 s4, 0, s7
+; TONGA-NEXT: s_mov_b32 s1, s5
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; TONGA-NEXT: v_xor_b32_e32 v0, s6, v0
+; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s6, v0
+; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
+; TONGA-NEXT: v_mul_lo_u32 v4, s4, v3
+; TONGA-NEXT: v_readfirstlane_b32 s4, v1
+; TONGA-NEXT: s_xor_b32 s5, s4, s8
+; TONGA-NEXT: s_abs_i32 s4, s4
+; TONGA-NEXT: v_mul_hi_u32 v1, v3, v4
+; TONGA-NEXT: s_ashr_i32 s5, s5, 31
+; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; TONGA-NEXT: v_mul_hi_u32 v1, s4, v1
+; TONGA-NEXT: v_readfirstlane_b32 s6, v1
+; TONGA-NEXT: s_mul_i32 s6, s6, s7
+; TONGA-NEXT: s_sub_i32 s4, s4, s6
+; TONGA-NEXT: s_sub_i32 s6, s4, s7
+; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v1
+; TONGA-NEXT: s_cmp_ge_u32 s4, s7
+; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
+; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; TONGA-NEXT: s_cselect_b32 s4, s6, s4
+; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v1
+; TONGA-NEXT: s_cmp_ge_u32 s4, s7
+; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
+; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; TONGA-NEXT: v_xor_b32_e32 v1, s5, v1
+; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s5, v1
+; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i32:
@@ -546,44 +558,44 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_readfirstlane_b32 s0, v2
; GFX9-NEXT: s_abs_i32 s1, s0
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s1
-; GFX9-NEXT: v_readfirstlane_b32 s5, v0
-; GFX9-NEXT: s_xor_b32 s0, s5, s0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_xor_b32 s0, s4, s0
; GFX9-NEXT: s_ashr_i32 s6, s0, 31
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX9-NEXT: s_sub_i32 s0, 0, s1
-; GFX9-NEXT: s_abs_i32 s5, s5
-; GFX9-NEXT: v_readfirstlane_b32 s4, v3
+; GFX9-NEXT: s_abs_i32 s4, s4
+; GFX9-NEXT: v_readfirstlane_b32 s5, v3
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s7, v0
; GFX9-NEXT: s_mul_i32 s0, s0, s7
; GFX9-NEXT: s_mul_hi_u32 s0, s7, s0
; GFX9-NEXT: s_add_i32 s7, s7, s0
-; GFX9-NEXT: s_mul_hi_u32 s0, s5, s7
+; GFX9-NEXT: s_mul_hi_u32 s0, s4, s7
; GFX9-NEXT: s_mul_i32 s7, s0, s1
-; GFX9-NEXT: s_sub_i32 s5, s5, s7
+; GFX9-NEXT: s_sub_i32 s4, s4, s7
; GFX9-NEXT: s_add_i32 s10, s0, 1
-; GFX9-NEXT: s_sub_i32 s7, s5, s1
-; GFX9-NEXT: s_cmp_ge_u32 s5, s1
+; GFX9-NEXT: s_sub_i32 s7, s4, s1
+; GFX9-NEXT: s_cmp_ge_u32 s4, s1
; GFX9-NEXT: s_cselect_b32 s0, s10, s0
-; GFX9-NEXT: s_cselect_b32 s5, s7, s5
+; GFX9-NEXT: s_cselect_b32 s4, s7, s4
; GFX9-NEXT: s_add_i32 s7, s0, 1
-; GFX9-NEXT: s_cmp_ge_u32 s5, s1
-; GFX9-NEXT: s_cselect_b32 s5, s7, s0
-; GFX9-NEXT: s_abs_i32 s7, s4
+; GFX9-NEXT: s_cmp_ge_u32 s4, s1
+; GFX9-NEXT: s_cselect_b32 s4, s7, s0
+; GFX9-NEXT: s_abs_i32 s7, s5
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT: s_xor_b32 s5, s5, s6
+; GFX9-NEXT: s_xor_b32 s4, s4, s6
; GFX9-NEXT: s_mov_b32 s1, s9
; GFX9-NEXT: s_sub_i32 s9, 0, s7
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_sub_i32 s5, s5, s6
+; GFX9-NEXT: s_sub_i32 s4, s4, s6
; GFX9-NEXT: s_mov_b32 s0, s8
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s4, s8, s4
+; GFX9-NEXT: s_xor_b32 s5, s8, s5
; GFX9-NEXT: s_abs_i32 s8, s8
-; GFX9-NEXT: s_ashr_i32 s4, s4, 31
+; GFX9-NEXT: s_ashr_i32 s5, s5, 31
; GFX9-NEXT: v_readfirstlane_b32 s6, v0
; GFX9-NEXT: s_mul_i32 s9, s9, s6
; GFX9-NEXT: s_mul_hi_u32 s9, s6, s9
@@ -599,10 +611,10 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_add_i32 s9, s6, 1
; GFX9-NEXT: s_cmp_ge_u32 s8, s7
; GFX9-NEXT: s_cselect_b32 s6, s9, s6
-; GFX9-NEXT: s_xor_b32 s6, s6, s4
-; GFX9-NEXT: s_sub_i32 s4, s6, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: s_xor_b32 s6, s6, s5
+; GFX9-NEXT: s_sub_i32 s5, s6, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -792,255 +804,255 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: sdiv_v4i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s10, s6
-; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s10, -1
+; GCN-NEXT: s_mov_b32 s6, s10
+; GCN-NEXT: s_mov_b32 s7, s11
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s2
-; GCN-NEXT: s_mov_b32 s9, s3
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_mov_b32 s4, s2
+; GCN-NEXT: s_mov_b32 s5, s3
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NEXT: s_mov_b32 s8, s0
+; GCN-NEXT: s_mov_b32 s9, s1
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4
-; GCN-NEXT: v_xor_b32_e32 v8, v0, v4
-; GCN-NEXT: v_max_i32_e32 v4, v4, v10
-; GCN-NEXT: v_cvt_f32_u32_e32 v10, v4
-; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v5
-; GCN-NEXT: v_xor_b32_e32 v11, v1, v5
-; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10
-; GCN-NEXT: v_max_i32_e32 v5, v5, v13
-; GCN-NEXT: v_cvt_f32_u32_e32 v13, v5
-; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v4
-; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10
-; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10
-; GCN-NEXT: v_rcp_iflag_f32_e32 v13, v13
-; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v1
-; GCN-NEXT: v_mul_lo_u32 v16, v16, v10
-; GCN-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13
-; GCN-NEXT: v_cvt_u32_f32_e32 v13, v13
-; GCN-NEXT: v_max_i32_e32 v0, v0, v9
-; GCN-NEXT: v_mul_hi_u32 v16, v10, v16
-; GCN-NEXT: v_max_i32_e32 v1, v1, v12
-; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v6
-; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v16
-; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v5
-; GCN-NEXT: v_mul_lo_u32 v16, v16, v13
-; GCN-NEXT: v_mul_hi_u32 v10, v0, v10
-; GCN-NEXT: v_xor_b32_e32 v14, v2, v6
-; GCN-NEXT: v_max_i32_e32 v6, v6, v15
-; GCN-NEXT: v_mul_hi_u32 v12, v13, v16
-; GCN-NEXT: v_cvt_f32_u32_e32 v15, v6
-; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11
-; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GCN-NEXT: v_mul_lo_u32 v13, v10, v4
-; GCN-NEXT: v_mul_hi_u32 v12, v1, v12
-; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v15
-; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v14
-; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v13
-; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10
-; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
-; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1]
-; GCN-NEXT: v_sub_i32_e32 v13, vcc, v0, v4
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1]
-; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
-; GCN-NEXT: v_mul_lo_u32 v0, v12, v5
-; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9
-; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9
-; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v6
-; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v12
-; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
-; GCN-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3]
-; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v5
-; GCN-NEXT: v_mul_lo_u32 v4, v4, v9
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3]
-; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
-; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v7
-; GCN-NEXT: v_max_i32_e32 v5, v7, v0
-; GCN-NEXT: v_cvt_f32_u32_e32 v0, v5
-; GCN-NEXT: v_mul_hi_u32 v4, v9, v4
-; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s1, v1
+; GCN-NEXT: v_readfirstlane_b32 s2, v2
+; GCN-NEXT: s_abs_i32 s13, s0
+; GCN-NEXT: s_abs_i32 s14, s1
+; GCN-NEXT: s_abs_i32 s15, s2
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, s13
+; GCN-NEXT: v_cvt_f32_u32_e32 v1, s14
+; GCN-NEXT: v_cvt_f32_u32_e32 v2, s15
+; GCN-NEXT: v_readfirstlane_b32 s6, v3
; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2
-; GCN-NEXT: v_max_i32_e32 v2, v2, v9
-; GCN-NEXT: v_mul_hi_u32 v4, v2, v4
+; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; GCN-NEXT: s_abs_i32 s17, s6
+; GCN-NEXT: v_cvt_f32_u32_e32 v3, s17
; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GCN-NEXT: v_cvt_u32_f32_e32 v9, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1]
-; GCN-NEXT: v_xor_b32_e32 v0, v0, v8
-; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
-; GCN-NEXT: v_mul_lo_u32 v8, v4, v6
-; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v1
-; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v5
-; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
-; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3]
-; GCN-NEXT: v_mul_lo_u32 v10, v10, v9
-; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4
-; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6
-; GCN-NEXT: v_xor_b32_e32 v1, v1, v11
-; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
-; GCN-NEXT: v_sub_i32_e32 v8, vcc, v2, v6
-; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v11
-; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
-; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
-; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc
-; GCN-NEXT: v_mul_hi_u32 v4, v9, v10
-; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
-; GCN-NEXT: v_max_i32_e32 v6, v3, v6
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; GCN-NEXT: v_mul_hi_u32 v4, v6, v4
-; GCN-NEXT: v_xor_b32_e32 v2, v2, v14
-; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v14
-; GCN-NEXT: v_mul_lo_u32 v8, v4, v5
-; GCN-NEXT: v_xor_b32_e32 v3, v3, v7
-; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; GCN-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
-; GCN-NEXT: v_sub_i32_e32 v8, vcc, v6, v5
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
-; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
-; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3
-; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; GCN-NEXT: v_xor_b32_e32 v4, v4, v3
-; GCN-NEXT: v_sub_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_readfirstlane_b32 s3, v4
+; GCN-NEXT: v_readfirstlane_b32 s4, v5
+; GCN-NEXT: v_readfirstlane_b32 s5, v6
+; GCN-NEXT: s_xor_b32 s12, s3, s0
+; GCN-NEXT: s_xor_b32 s0, s4, s1
+; GCN-NEXT: s_xor_b32 s1, s5, s2
+; GCN-NEXT: s_sub_i32 s2, 0, s13
+; GCN-NEXT: s_ashr_i32 s18, s0, 31
+; GCN-NEXT: s_sub_i32 s0, 0, s14
+; GCN-NEXT: s_ashr_i32 s19, s1, 31
+; GCN-NEXT: s_sub_i32 s1, 0, s15
+; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; GCN-NEXT: v_mul_lo_u32 v4, s2, v0
+; GCN-NEXT: v_mul_lo_u32 v5, s0, v1
+; GCN-NEXT: v_mul_lo_u32 v6, s1, v2
+; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GCN-NEXT: v_mul_hi_u32 v4, v0, v4
+; GCN-NEXT: v_mul_hi_u32 v5, v1, v5
+; GCN-NEXT: v_mul_hi_u32 v6, v2, v6
+; GCN-NEXT: s_sub_i32 s20, 0, s17
+; GCN-NEXT: v_readfirstlane_b32 s7, v7
+; GCN-NEXT: s_abs_i32 s3, s3
+; GCN-NEXT: s_abs_i32 s4, s4
+; GCN-NEXT: s_abs_i32 s5, s5
+; GCN-NEXT: v_mul_lo_u32 v7, s20, v3
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; GCN-NEXT: v_mul_hi_u32 v0, s3, v0
+; GCN-NEXT: v_mul_hi_u32 v1, s4, v1
+; GCN-NEXT: v_mul_hi_u32 v2, s5, v2
+; GCN-NEXT: v_mul_hi_u32 v7, v3, v7
+; GCN-NEXT: v_mul_lo_u32 v4, v0, s13
+; GCN-NEXT: v_mul_lo_u32 v6, v1, s14
+; GCN-NEXT: v_mul_lo_u32 v8, v2, s15
+; GCN-NEXT: s_abs_i32 s16, s7
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GCN-NEXT: v_mul_hi_u32 v3, s16, v3
+; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v4
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, s4, v6
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, s5, v8
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v1
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v2
+; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4
+; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v6
+; GCN-NEXT: v_cmp_le_u32_e64 s[4:5], s15, v8
+; GCN-NEXT: v_subrev_i32_e32 v10, vcc, s13, v4
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1]
+; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s14, v6
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3]
+; GCN-NEXT: v_subrev_i32_e32 v7, vcc, s15, v8
+; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v0
+; GCN-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3]
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v1
+; GCN-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[4:5]
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v2
+; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v4
+; GCN-NEXT: v_mul_lo_u32 v4, v3, s17
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
+; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v5
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GCN-NEXT: v_cmp_le_u32_e32 vcc, s15, v7
+; GCN-NEXT: s_ashr_i32 s12, s12, 31
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, s12, v0
+; GCN-NEXT: v_xor_b32_e32 v1, s18, v1
+; GCN-NEXT: v_xor_b32_e32 v2, s19, v2
+; GCN-NEXT: v_sub_i32_e32 v4, vcc, s16, v4
+; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0
+; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s18, v1
+; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s19, v2
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3
+; GCN-NEXT: v_subrev_i32_e32 v6, vcc, s17, v4
+; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
+; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3
+; GCN-NEXT: s_xor_b32 s0, s7, s6
+; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
+; GCN-NEXT: s_ashr_i32 s0, s0, 31
+; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GCN-NEXT: v_xor_b32_e32 v3, s0, v3
+; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s0, v3
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: sdiv_v4i32:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_mov_b32 s11, 0xf000
+; TONGA-NEXT: s_mov_b32 s10, -1
+; TONGA-NEXT: s_mov_b32 s6, s10
+; TONGA-NEXT: s_mov_b32 s7, s11
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
-; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s4, s2
+; TONGA-NEXT: s_mov_b32 s5, s3
+; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
+; TONGA-NEXT: s_mov_b32 s8, s0
+; TONGA-NEXT: s_mov_b32 s9, s1
; TONGA-NEXT: s_waitcnt vmcnt(1)
-; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0
-; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4
-; TONGA-NEXT: v_xor_b32_e32 v8, v0, v4
-; TONGA-NEXT: v_max_i32_e32 v4, v4, v10
-; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v4
-; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5
-; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10
-; TONGA-NEXT: v_max_i32_e32 v5, v5, v13
-; TONGA-NEXT: v_cvt_f32_u32_e32 v13, v5
-; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v4
-; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10
-; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v13, v13
-; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1
-; TONGA-NEXT: v_mul_lo_u32 v16, v16, v10
-; TONGA-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13
-; TONGA-NEXT: v_cvt_u32_f32_e32 v13, v13
-; TONGA-NEXT: v_max_i32_e32 v0, v0, v9
-; TONGA-NEXT: v_mul_hi_u32 v16, v10, v16
-; TONGA-NEXT: v_max_i32_e32 v1, v1, v12
-; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v6
-; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v16
-; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v5
-; TONGA-NEXT: v_mul_lo_u32 v16, v16, v13
-; TONGA-NEXT: v_mul_hi_u32 v10, v0, v10
-; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6
-; TONGA-NEXT: v_max_i32_e32 v6, v6, v15
-; TONGA-NEXT: v_mul_hi_u32 v12, v13, v16
-; TONGA-NEXT: v_cvt_f32_u32_e32 v15, v6
-; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v11
-; TONGA-NEXT: v_add_u32_e32 v12, vcc, v13, v12
-; TONGA-NEXT: v_mul_lo_u32 v13, v10, v4
-; TONGA-NEXT: v_mul_hi_u32 v12, v1, v12
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v15
-; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v14
-; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v13
-; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10
-; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
-; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1]
-; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v0, v4
-; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1]
-; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
-; TONGA-NEXT: v_mul_lo_u32 v0, v12, v5
-; TONGA-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9
-; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9
-; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v6
-; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0
-; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v12
-; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
-; TONGA-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3]
-; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v5
-; TONGA-NEXT: v_mul_lo_u32 v4, v4, v9
-; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3]
-; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
-; TONGA-NEXT: v_sub_u32_e32 v0, vcc, 0, v7
-; TONGA-NEXT: v_max_i32_e32 v5, v7, v0
-; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5
-; TONGA-NEXT: v_mul_hi_u32 v4, v9, v4
-; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10
+; TONGA-NEXT: v_readfirstlane_b32 s0, v0
+; TONGA-NEXT: v_readfirstlane_b32 s1, v1
+; TONGA-NEXT: v_readfirstlane_b32 s2, v2
+; TONGA-NEXT: s_abs_i32 s13, s0
+; TONGA-NEXT: s_abs_i32 s14, s1
+; TONGA-NEXT: s_abs_i32 s15, s2
+; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s13
+; TONGA-NEXT: v_cvt_f32_u32_e32 v1, s14
+; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s15
+; TONGA-NEXT: v_readfirstlane_b32 s6, v3
; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4
-; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2
-; TONGA-NEXT: v_max_i32_e32 v2, v2, v9
-; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; TONGA-NEXT: s_abs_i32 s17, s6
+; TONGA-NEXT: v_cvt_f32_u32_e32 v3, s17
; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v0
-; TONGA-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1]
-; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8
-; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
-; TONGA-NEXT: v_mul_lo_u32 v8, v4, v6
-; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v1
-; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v5
-; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v8
-; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3]
-; TONGA-NEXT: v_mul_lo_u32 v10, v10, v9
-; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4
-; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6
-; TONGA-NEXT: v_xor_b32_e32 v1, v1, v11
-; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
-; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v2, v6
-; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v11
-; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
-; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4
-; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
-; TONGA-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc
-; TONGA-NEXT: v_mul_hi_u32 v4, v9, v10
-; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v3
-; TONGA-NEXT: v_max_i32_e32 v6, v3, v6
-; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4
-; TONGA-NEXT: v_mul_hi_u32 v4, v6, v4
-; TONGA-NEXT: v_xor_b32_e32 v2, v2, v14
-; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v14
-; TONGA-NEXT: v_mul_lo_u32 v8, v4, v5
-; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7
-; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4
-; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v8
-; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v6, v5
-; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
-; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
-; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4
-; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
-; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v3
-; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; TONGA-NEXT: v_xor_b32_e32 v4, v4, v3
-; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v4, v3
-; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; TONGA-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0
+; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1
+; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2
+; TONGA-NEXT: s_waitcnt vmcnt(0)
+; TONGA-NEXT: v_readfirstlane_b32 s3, v4
+; TONGA-NEXT: v_readfirstlane_b32 s4, v5
+; TONGA-NEXT: v_readfirstlane_b32 s5, v6
+; TONGA-NEXT: s_xor_b32 s12, s3, s0
+; TONGA-NEXT: s_xor_b32 s0, s4, s1
+; TONGA-NEXT: s_xor_b32 s1, s5, s2
+; TONGA-NEXT: s_sub_i32 s2, 0, s13
+; TONGA-NEXT: s_ashr_i32 s18, s0, 31
+; TONGA-NEXT: s_sub_i32 s0, 0, s14
+; TONGA-NEXT: s_ashr_i32 s19, s1, 31
+; TONGA-NEXT: s_sub_i32 s1, 0, s15
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; TONGA-NEXT: v_mul_lo_u32 v4, s2, v0
+; TONGA-NEXT: v_mul_lo_u32 v5, s0, v1
+; TONGA-NEXT: v_mul_lo_u32 v6, s1, v2
+; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
+; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4
+; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5
+; TONGA-NEXT: v_mul_hi_u32 v6, v2, v6
+; TONGA-NEXT: s_sub_i32 s20, 0, s17
+; TONGA-NEXT: v_readfirstlane_b32 s7, v7
+; TONGA-NEXT: s_abs_i32 s3, s3
+; TONGA-NEXT: s_abs_i32 s4, s4
+; TONGA-NEXT: s_abs_i32 s5, s5
+; TONGA-NEXT: v_mul_lo_u32 v7, s20, v3
+; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4
+; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5
+; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6
+; TONGA-NEXT: v_mul_hi_u32 v0, s3, v0
+; TONGA-NEXT: v_mul_hi_u32 v1, s4, v1
+; TONGA-NEXT: v_mul_hi_u32 v2, s5, v2
+; TONGA-NEXT: v_mul_hi_u32 v7, v3, v7
+; TONGA-NEXT: v_mul_lo_u32 v4, v0, s13
+; TONGA-NEXT: v_mul_lo_u32 v6, v1, s14
+; TONGA-NEXT: v_mul_lo_u32 v8, v2, s15
+; TONGA-NEXT: s_abs_i32 s16, s7
+; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7
+; TONGA-NEXT: v_mul_hi_u32 v3, s16, v3
+; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s3, v4
+; TONGA-NEXT: v_sub_u32_e32 v6, vcc, s4, v6
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, s5, v8
+; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v0
+; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v1
+; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v2
+; TONGA-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4
+; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v6
+; TONGA-NEXT: v_cmp_le_u32_e64 s[4:5], s15, v8
+; TONGA-NEXT: v_subrev_u32_e32 v10, vcc, s13, v4
+; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1]
+; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, s14, v6
+; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3]
+; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, s15, v8
+; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5]
+; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
+; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v0
+; TONGA-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3]
+; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v1
+; TONGA-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[4:5]
+; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v2
+; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s13, v4
+; TONGA-NEXT: v_mul_lo_u32 v4, v3, s17
+; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
+; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s14, v5
+; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s15, v7
+; TONGA-NEXT: s_ashr_i32 s12, s12, 31
+; TONGA-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; TONGA-NEXT: v_xor_b32_e32 v0, s12, v0
+; TONGA-NEXT: v_xor_b32_e32 v1, s18, v1
+; TONGA-NEXT: v_xor_b32_e32 v2, s19, v2
+; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s16, v4
+; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s12, v0
+; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s18, v1
+; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, s19, v2
+; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3
+; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, s17, v4
+; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
+; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3
+; TONGA-NEXT: s_xor_b32 s0, s7, s6
+; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
+; TONGA-NEXT: s_ashr_i32 s0, s0, 31
+; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; TONGA-NEXT: v_xor_b32_e32 v3, s0, v3
+; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, s0, v3
+; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v4i32:
@@ -2002,7 +2014,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GCN-NEXT: v_xor_b32_e32 v1, v1, v0
-; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -2049,7 +2061,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0
-; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0
+; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v0, v1
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25
; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll
index 49dec15f9f7d7..584d26ed41893 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll
@@ -42,34 +42,35 @@ define amdgpu_kernel void @v_sext_in_reg_i8_i16_shuffle_vector(ptr addrspace(1)
; GFX11-FAKE16-LABEL: v_sext_in_reg_i8_i16_shuffle_vector:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, 0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v2, 24, v1
+; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v3, 24, v1
; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v5, 24, v0
; GFX11-FAKE16-NEXT: v_ashrrev_i16 v6, 8, v1
; GFX11-FAKE16-NEXT: v_bfe_i32 v7, v0, 0, 8
; GFX11-FAKE16-NEXT: v_ashrrev_i16 v0, 8, v0
; GFX11-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX11-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX11-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX11-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8
; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v7, v7
; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0
; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v1, v1
; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v6, v6
; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v5, v5
-; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v8, v2
-; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v2, v4
-; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v4, v3
+; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v8, v3
+; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v4, v4
+; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v9, v2
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v3, v0, v7
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v6, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_pack_b32_f16 v2, v5, v2
-; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v8, v4
-; GFX11-FAKE16-NEXT: global_store_b128 v9, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT: v_pack_b32_f16 v2, v5, v4
+; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v8, v9
+; GFX11-FAKE16-NEXT: global_store_b128 v10, v[0:3], s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <{ [0 x i8] }>, ptr addrspace(1) %ptr, i64 0, i32 0, i32 %tid
diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll
index 9a051b3fd8bb7..df32e2a4cfad2 100644
--- a/llvm/test/CodeGen/NVPTX/i1-select.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-select.ll
@@ -108,9 +108,9 @@ define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i
; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_folding_param_3];
; CHECK-NEXT: xor.pred %p6, %p1, %p3;
; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_4];
-; CHECK-NEXT: and.pred %p7, %p6, %p4;
+; CHECK-NEXT: and.pred %p8, %p6, %p4;
; CHECK-NEXT: and.pred %p9, %p2, %p4;
-; CHECK-NEXT: and.pred %p10, %p3, %p7;
+; CHECK-NEXT: and.pred %p10, %p3, %p8;
; CHECK-NEXT: or.pred %p11, %p10, %p9;
; CHECK-NEXT: xor.pred %p12, %p11, %p3;
; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p12;
diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll
index 28a95ef4f8de9..f11a9c854c465 100644
--- a/llvm/test/CodeGen/RISCV/abds.ll
+++ b/llvm/test/CodeGen/RISCV/abds.ll
@@ -2011,50 +2011,50 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
; RV32I-LABEL: abd_subnsw_i128:
; RV32I: # %bb.0:
-; RV32I-NEXT: lw a4, 0(a2)
-; RV32I-NEXT: lw a3, 4(a2)
+; RV32I-NEXT: lw a3, 0(a2)
+; RV32I-NEXT: lw a4, 4(a2)
; RV32I-NEXT: lw a5, 8(a2)
-; RV32I-NEXT: lw a6, 12(a2)
+; RV32I-NEXT: lw a2, 12(a2)
; RV32I-NEXT: lw a7, 8(a1)
; RV32I-NEXT: lw t0, 12(a1)
-; RV32I-NEXT: lw a2, 0(a1)
+; RV32I-NEXT: lw a6, 0(a1)
; RV32I-NEXT: lw a1, 4(a1)
; RV32I-NEXT: sltu t1, a7, a5
-; RV32I-NEXT: sub t0, t0, a6
-; RV32I-NEXT: sltu a6, a2, a4
+; RV32I-NEXT: sub t0, t0, a2
+; RV32I-NEXT: sltu a2, a6, a3
; RV32I-NEXT: sub t0, t0, t1
-; RV32I-NEXT: mv t1, a6
-; RV32I-NEXT: beq a1, a3, .LBB31_2
+; RV32I-NEXT: mv t1, a2
+; RV32I-NEXT: beq a1, a4, .LBB31_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: sltu t1, a1, a3
+; RV32I-NEXT: sltu t1, a1, a4
; RV32I-NEXT: .LBB31_2:
; RV32I-NEXT: sub a5, a7, a5
-; RV32I-NEXT: sub a3, a1, a3
-; RV32I-NEXT: sltu a1, a5, t1
+; RV32I-NEXT: sub a1, a1, a4
+; RV32I-NEXT: sltu a4, a5, t1
; RV32I-NEXT: sub a5, a5, t1
-; RV32I-NEXT: sub a1, t0, a1
-; RV32I-NEXT: sub a3, a3, a6
-; RV32I-NEXT: sub a2, a2, a4
-; RV32I-NEXT: bgez a1, .LBB31_4
+; RV32I-NEXT: sub a4, t0, a4
+; RV32I-NEXT: sub a2, a1, a2
+; RV32I-NEXT: sub a1, a6, a3
+; RV32I-NEXT: bgez a4, .LBB31_4
; RV32I-NEXT: # %bb.3:
-; RV32I-NEXT: snez a4, a3
-; RV32I-NEXT: snez a6, a2
+; RV32I-NEXT: snez a3, a2
+; RV32I-NEXT: snez a6, a1
; RV32I-NEXT: neg a7, a5
; RV32I-NEXT: snez a5, a5
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: add a4, a4, a5
+; RV32I-NEXT: add a2, a2, a6
+; RV32I-NEXT: sltu a6, a7, a3
+; RV32I-NEXT: neg a4, a4
+; RV32I-NEXT: sub a5, a7, a3
; RV32I-NEXT: neg a2, a2
-; RV32I-NEXT: or a4, a6, a4
-; RV32I-NEXT: add a1, a1, a5
-; RV32I-NEXT: add a3, a3, a6
-; RV32I-NEXT: sltu a6, a7, a4
+; RV32I-NEXT: sub a4, a4, a6
; RV32I-NEXT: neg a1, a1
-; RV32I-NEXT: sub a5, a7, a4
-; RV32I-NEXT: sub a1, a1, a6
-; RV32I-NEXT: neg a3, a3
; RV32I-NEXT: .LBB31_4:
-; RV32I-NEXT: sw a2, 0(a0)
-; RV32I-NEXT: sw a3, 4(a0)
+; RV32I-NEXT: sw a1, 0(a0)
+; RV32I-NEXT: sw a2, 4(a0)
; RV32I-NEXT: sw a5, 8(a0)
-; RV32I-NEXT: sw a1, 12(a0)
+; RV32I-NEXT: sw a4, 12(a0)
; RV32I-NEXT: ret
;
; RV64I-LABEL: abd_subnsw_i128:
@@ -2074,50 +2074,50 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
;
; RV32ZBB-LABEL: abd_subnsw_i128:
; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: lw a4, 0(a2)
-; RV32ZBB-NEXT: lw a3, 4(a2)
+; RV32ZBB-NEXT: lw a3, 0(a2)
+; RV32ZBB-NEXT: lw a4, 4(a2)
; RV32ZBB-NEXT: lw a5, 8(a2)
-; RV32ZBB-NEXT: lw a6, 12(a2)
+; RV32ZBB-NEXT: lw a2, 12(a2)
; RV32ZBB-NEXT: lw a7, 8(a1)
; RV32ZBB-NEXT: lw t0, 12(a1)
-; RV32ZBB-NEXT: lw a2, 0(a1)
+; RV32ZBB-NEXT: lw a6, 0(a1)
; RV32ZBB-NEXT: lw a1, 4(a1)
; RV32ZBB-NEXT: sltu t1, a7, a5
-; RV32ZBB-NEXT: sub t0, t0, a6
-; RV32ZBB-NEXT: sltu a6, a2, a4
+; RV32ZBB-NEXT: sub t0, t0, a2
+; RV32ZBB-NEXT: sltu a2, a6, a3
; RV32ZBB-NEXT: sub t0, t0, t1
-; RV32ZBB-NEXT: mv t1, a6
-; RV32ZBB-NEXT: beq a1, a3, .LBB31_2
+; RV32ZBB-NEXT: mv t1, a2
+; RV32ZBB-NEXT: beq a1, a4, .LBB31_2
; RV32ZBB-NEXT: # %bb.1:
-; RV32ZBB-NEXT: sltu t1, a1, a3
+; RV32ZBB-NEXT: sltu t1, a1, a4
; RV32ZBB-NEXT: .LBB31_2:
; RV32ZBB-NEXT: sub a5, a7, a5
-; RV32ZBB-NEXT: sub a3, a1, a3
-; RV32ZBB-NEXT: sltu a1, a5, t1
+; RV32ZBB-NEXT: sub a1, a1, a4
+; RV32ZBB-NEXT: sltu a4, a5, t1
; RV32ZBB-NEXT: sub a5, a5, t1
-; RV32ZBB-NEXT: sub a1, t0, a1
-; RV32ZBB-NEXT: sub a3, a3, a6
-; RV32ZBB-NEXT: sub a2, a2, a4
-; RV32ZBB-NEXT: bgez a1, .LBB31_4
+; RV32ZBB-NEXT: sub a4, t0, a4
+; RV32ZBB-NEXT: sub a2, a1, a2
+; RV32ZBB-NEXT: sub a1, a6, a3
+; RV32ZBB-NEXT: bgez a4, .LBB31_4
; RV32ZBB-NEXT: # %bb.3:
-; RV32ZBB-NEXT: snez a4, a3
-; RV32ZBB-NEXT: snez a6, a2
+; RV32ZBB-NEXT: snez a3, a2
+; RV32ZBB-NEXT: snez a6, a1
; RV32ZBB-NEXT: neg a7, a5
; RV32ZBB-NEXT: snez a5, a5
+; RV32ZBB-NEXT: or a3, a6, a3
+; RV32ZBB-NEXT: add a4, a4, a5
+; RV32ZBB-NEXT: add a2, a2, a6
+; RV32ZBB-NEXT: sltu a6, a7, a3
+; RV32ZBB-NEXT: neg a4, a4
+; RV32ZBB-NEXT: sub a5, a7, a3
; RV32ZBB-NEXT: neg a2, a2
-; RV32ZBB-NEXT: or a4, a6, a4
-; RV32ZBB-NEXT: add a1, a1, a5
-; RV32ZBB-NEXT: add a3, a3, a6
-; RV32ZBB-NEXT: sltu a6, a7, a4
+; RV32ZBB-NEXT: sub a4, a4, a6
; RV32ZBB-NEXT: neg a1, a1
-; RV32ZBB-NEXT: sub a5, a7, a4
-; RV32ZBB-NEXT: sub a1, a1, a6
-; RV32ZBB-NEXT: neg a3, a3
; RV32ZBB-NEXT: .LBB31_4:
-; RV32ZBB-NEXT: sw a2, 0(a0)
-; RV32ZBB-NEXT: sw a3, 4(a0)
+; RV32ZBB-NEXT: sw a1, 0(a0)
+; RV32ZBB-NEXT: sw a2, 4(a0)
; RV32ZBB-NEXT: sw a5, 8(a0)
-; RV32ZBB-NEXT: sw a1, 12(a0)
+; RV32ZBB-NEXT: sw a4, 12(a0)
; RV32ZBB-NEXT: ret
;
; RV64ZBB-LABEL: abd_subnsw_i128:
@@ -2142,50 +2142,50 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
; RV32I-LABEL: abd_subnsw_i128_undef:
; RV32I: # %bb.0:
-; RV32I-NEXT: lw a4, 0(a2)
-; RV32I-NEXT: lw a3, 4(a2)
+; RV32I-NEXT: lw a3, 0(a2)
+; RV32I-NEXT: lw a4, 4(a2)
; RV32I-NEXT: lw a5, 8(a2)
-; RV32I-NEXT: lw a6, 12(a2)
+; RV32I-NEXT: lw a2, 12(a2)
; RV32I-NEXT: lw a7, 8(a1)
; RV32I-NEXT: lw t0, 12(a1)
-; RV32I-NEXT: lw a2, 0(a1)
+; RV32I-NEXT: lw a6, 0(a1)
; RV32I-NEXT: lw a1, 4(a1)
; RV32I-NEXT: sltu t1, a7, a5
-; RV32I-NEXT: sub t0, t0, a6
-; RV32I-NEXT: sltu a6, a2, a4
+; RV32I-NEXT: sub t0, t0, a2
+; RV32I-NEXT: sltu a2, a6, a3
; RV32I-NEXT: sub t0, t0, t1
-; RV32I-NEXT: mv t1, a6
-; RV32I-NEXT: beq a1, a3, .LBB32_2
+; RV32I-NEXT: mv t1, a2
+; RV32I-NEXT: beq a1, a4, .LBB32_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: sltu t1, a1, a3
+; RV32I-NEXT: sltu t1, a1, a4
; RV32I-NEXT: .LBB32_2:
; RV32I-NEXT: sub a5, a7, a5
-; RV32I-NEXT: sub a3, a1, a3
-; RV32I-NEXT: sltu a1, a5, t1
+; RV32I-NEXT: sub a1, a1, a4
+; RV32I-NEXT: sltu a4, a5, t1
; RV32I-NEXT: sub a5, a5, t1
-; RV32I-NEXT: sub a1, t0, a1
-; RV32I-NEXT: sub a3, a3, a6
-; RV32I-NEXT: sub a2, a2, a4
-; RV32I-NEXT: bgez a1, .LBB32_4
+; RV32I-NEXT: sub a4, t0, a4
+; RV32I-NEXT: sub a2, a1, a2
+; RV32I-NEXT: sub a1, a6, a3
+; RV32I-NEXT: bgez a4, .LBB32_4
; RV32I-NEXT: # %bb.3:
-; RV32I-NEXT: snez a4, a3
-; RV32I-NEXT: snez a6, a2
+; RV32I-NEXT: snez a3, a2
+; RV32I-NEXT: snez a6, a1
; RV32I-NEXT: neg a7, a5
; RV32I-NEXT: snez a5, a5
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: add a4, a4, a5
+; RV32I-NEXT: add a2, a2, a6
+; RV32I-NEXT: sltu a6, a7, a3
+; RV32I-NEXT: neg a4, a4
+; RV32I-NEXT: sub a5, a7, a3
; RV32I-NEXT: neg a2, a2
-; RV32I-NEXT: or a4, a6, a4
-; RV32I-NEXT: add a1, a1, a5
-; RV32I-NEXT: add a3, a3, a6
-; RV32I-NEXT: sltu a6, a7, a4
+; RV32I-NEXT: sub a4, a4, a6
; RV32I-NEXT: neg a1, a1
-; RV32I-NEXT: sub a5, a7, a4
-; RV32I-NEXT: sub a1, a1, a6
-; RV32I-NEXT: neg a3, a3
; RV32I-NEXT: .LBB32_4:
-; RV32I-NEXT: sw a2, 0(a0)
-; RV32I-NEXT: sw a3, 4(a0)
+; RV32I-NEXT: sw a1, 0(a0)
+; RV32I-NEXT: sw a2, 4(a0)
; RV32I-NEXT: sw a5, 8(a0)
-; RV32I-NEXT: sw a1, 12(a0)
+; RV32I-NEXT: sw a4, 12(a0)
; RV32I-NEXT: ret
;
; RV64I-LABEL: abd_subnsw_i128_undef:
@@ -2205,50 +2205,50 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
;
; RV32ZBB-LABEL: abd_subnsw_i128_undef:
; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: lw a4, 0(a2)
-; RV32ZBB-NEXT: lw a3, 4(a2)
+; RV32ZBB-NEXT: lw a3, 0(a2)
+; RV32ZBB-NEXT: lw a4, 4(a2)
; RV32ZBB-NEXT: lw a5, 8(a2)
-; RV32ZBB-NEXT: lw a6, 12(a2)
+; RV32ZBB-NEXT: lw a2, 12(a2)
; RV32ZBB-NEXT: lw a7, 8(a1)
; RV32ZBB-NEXT: lw t0, 12(a1)
-; RV32ZBB-NEXT: lw a2, 0(a1)
+; RV32ZBB-NEXT: lw a6, 0(a1)
; RV32ZBB-NEXT: lw a1, 4(a1)
; RV32ZBB-NEXT: sltu t1, a7, a5
-; RV32ZBB-NEXT: sub t0, t0, a6
-; RV32ZBB-NEXT: sltu a6, a2, a4
+; RV32ZBB-NEXT: sub t0, t0, a2
+; RV32ZBB-NEXT: sltu a2, a6, a3
; RV32ZBB-NEXT: sub t0, t0, t1
-; RV32ZBB-NEXT: mv t1, a6
-; RV32ZBB-NEXT: beq a1, a3, .LBB32_2
+; RV32ZBB-NEXT: mv t1, a2
+; RV32ZBB-NEXT: beq a1, a4, .LBB32_2
; RV32ZBB-NEXT: # %bb.1:
-; RV32ZBB-NEXT: sltu t1, a1, a3
+; RV32ZBB-NEXT: sltu t1, a1, a4
; RV32ZBB-NEXT: .LBB32_2:
; RV32ZBB-NEXT: sub a5, a7, a5
-; RV32ZBB-NEXT: sub a3, a1, a3
-; RV32ZBB-NEXT: sltu a1, a5, t1
+; RV32ZBB-NEXT: sub a1, a1, a4
+; RV32ZBB-NEXT: sltu a4, a5, t1
; RV32ZBB-NEXT: sub a5, a5, t1
-; RV32ZBB-NEXT: sub a1, t0, a1
-; RV32ZBB-NEXT: sub a3, a3, a6
-; RV32ZBB-NEXT: sub a2, a2, a4
-; RV32ZBB-NEXT: bgez a1, .LBB32_4
+; RV32ZBB-NEXT: sub a4, t0, a4
+; RV32ZBB-NEXT: sub a2, a1, a2
+; RV32ZBB-NEXT: sub a1, a6, a3
+; RV32ZBB-NEXT: bgez a4, .LBB32_4
; RV32ZBB-NEXT: # %bb.3:
-; RV32ZBB-NEXT: snez a4, a3
-; RV32ZBB-NEXT: snez a6, a2
+; RV32ZBB-NEXT: snez a3, a2
+; RV32ZBB-NEXT: snez a6, a1
; RV32ZBB-NEXT: neg a7, a5
; RV32ZBB-NEXT: snez a5, a5
+; RV32ZBB-NEXT: or a3, a6, a3
+; RV32ZBB-NEXT: add a4, a4, a5
+; RV32ZBB-NEXT: add a2, a2, a6
+; RV32ZBB-NEXT: sltu a6, a7, a3
+; RV32ZBB-NEXT: neg a4, a4
+; RV32ZBB-NEXT: sub a5, a7, a3
; RV32ZBB-NEXT: neg a2, a2
-; RV32ZBB-NEXT: or a4, a6, a4
-; RV32ZBB-NEXT: add a1, a1, a5
-; RV32ZBB-NEXT: add a3, a3, a6
-; RV32ZBB-NEXT: sltu a6, a7, a4
+; RV32ZBB-NEXT: sub a4, a4, a6
; RV32ZBB-NEXT: neg a1, a1
-; RV32ZBB-NEXT: sub a5, a7, a4
-; RV32ZBB-NEXT: sub a1, a1, a6
-; RV32ZBB-NEXT: neg a3, a3
; RV32ZBB-NEXT: .LBB32_4:
-; RV32ZBB-NEXT: sw a2, 0(a0)
-; RV32ZBB-NEXT: sw a3, 4(a0)
+; RV32ZBB-NEXT: sw a1, 0(a0)
+; RV32ZBB-NEXT: sw a2, 4(a0)
; RV32ZBB-NEXT: sw a5, 8(a0)
-; RV32ZBB-NEXT: sw a1, 12(a0)
+; RV32ZBB-NEXT: sw a4, 12(a0)
; RV32ZBB-NEXT: ret
;
; RV64ZBB-LABEL: abd_subnsw_i128_undef:
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 117e3e4aac45d..519f1e851a832 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -1110,15 +1110,15 @@ define i64 @stest_f64i64(double %x) {
; RV32IF-NEXT: .LBB18_3: # %entry
; RV32IF-NEXT: slti a6, a4, 0
; RV32IF-NEXT: .LBB18_4: # %entry
-; RV32IF-NEXT: addi a7, a6, -1
-; RV32IF-NEXT: neg t0, a6
+; RV32IF-NEXT: neg a7, a6
+; RV32IF-NEXT: addi t0, a6, -1
; RV32IF-NEXT: bnez a6, .LBB18_6
; RV32IF-NEXT: # %bb.5: # %entry
; RV32IF-NEXT: mv a1, a5
; RV32IF-NEXT: .LBB18_6: # %entry
-; RV32IF-NEXT: or a3, a7, a3
-; RV32IF-NEXT: and a4, t0, a4
-; RV32IF-NEXT: and a2, t0, a2
+; RV32IF-NEXT: or a3, t0, a3
+; RV32IF-NEXT: and a4, a7, a4
+; RV32IF-NEXT: and a2, a7, a2
; RV32IF-NEXT: beq a1, a0, .LBB18_8
; RV32IF-NEXT: # %bb.7: # %entry
; RV32IF-NEXT: sltu a0, a0, a1
@@ -1213,15 +1213,15 @@ define i64 @stest_f64i64(double %x) {
; RV32IFD-NEXT: .LBB18_3: # %entry
; RV32IFD-NEXT: slti a6, a4, 0
; RV32IFD-NEXT: .LBB18_4: # %entry
-; RV32IFD-NEXT: addi a7, a6, -1
-; RV32IFD-NEXT: neg t0, a6
+; RV32IFD-NEXT: neg a7, a6
+; RV32IFD-NEXT: addi t0, a6, -1
; RV32IFD-NEXT: bnez a6, .LBB18_6
; RV32IFD-NEXT: # %bb.5: # %entry
; RV32IFD-NEXT: mv a1, a5
; RV32IFD-NEXT: .LBB18_6: # %entry
-; RV32IFD-NEXT: or a3, a7, a3
-; RV32IFD-NEXT: and a4, t0, a4
-; RV32IFD-NEXT: and a2, t0, a2
+; RV32IFD-NEXT: or a3, t0, a3
+; RV32IFD-NEXT: and a4, a7, a4
+; RV32IFD-NEXT: and a2, a7, a2
; RV32IFD-NEXT: beq a1, a0, .LBB18_8
; RV32IFD-NEXT: # %bb.7: # %entry
; RV32IFD-NEXT: sltu a0, a0, a1
@@ -1378,8 +1378,8 @@ define i64 @ustest_f64i64(double %x) {
; RV32IF-NEXT: # %bb.4: # %entry
; RV32IF-NEXT: li a0, 1
; RV32IF-NEXT: .LBB20_5: # %entry
-; RV32IF-NEXT: lw a3, 8(sp)
-; RV32IF-NEXT: lw a4, 12(sp)
+; RV32IF-NEXT: lw a4, 8(sp)
+; RV32IF-NEXT: lw a3, 12(sp)
; RV32IF-NEXT: and a5, a2, a1
; RV32IF-NEXT: beqz a5, .LBB20_7
; RV32IF-NEXT: # %bb.6: # %entry
@@ -1393,12 +1393,12 @@ define i64 @ustest_f64i64(double %x) {
; RV32IF-NEXT: and a2, a2, a3
; RV32IF-NEXT: bnez a0, .LBB20_10
; RV32IF-NEXT: # %bb.9:
-; RV32IF-NEXT: or a0, a2, a4
+; RV32IF-NEXT: or a0, a4, a2
; RV32IF-NEXT: snez a1, a0
; RV32IF-NEXT: .LBB20_10: # %entry
; RV32IF-NEXT: neg a1, a1
-; RV32IF-NEXT: and a0, a1, a2
-; RV32IF-NEXT: and a1, a1, a4
+; RV32IF-NEXT: and a0, a1, a4
+; RV32IF-NEXT: and a1, a1, a2
; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IF-NEXT: .cfi_restore ra
; RV32IF-NEXT: addi sp, sp, 32
@@ -1461,8 +1461,8 @@ define i64 @ustest_f64i64(double %x) {
; RV32IFD-NEXT: # %bb.4: # %entry
; RV32IFD-NEXT: li a0, 1
; RV32IFD-NEXT: .LBB20_5: # %entry
-; RV32IFD-NEXT: lw a3, 8(sp)
-; RV32IFD-NEXT: lw a4, 12(sp)
+; RV32IFD-NEXT: lw a4, 8(sp)
+; RV32IFD-NEXT: lw a3, 12(sp)
; RV32IFD-NEXT: and a5, a2, a1
; RV32IFD-NEXT: beqz a5, .LBB20_7
; RV32IFD-NEXT: # %bb.6: # %entry
@@ -1476,12 +1476,12 @@ define i64 @ustest_f64i64(double %x) {
; RV32IFD-NEXT: and a2, a2, a3
; RV32IFD-NEXT: bnez a0, .LBB20_10
; RV32IFD-NEXT: # %bb.9:
-; RV32IFD-NEXT: or a0, a2, a4
+; RV32IFD-NEXT: or a0, a4, a2
; RV32IFD-NEXT: snez a1, a0
; RV32IFD-NEXT: .LBB20_10: # %entry
; RV32IFD-NEXT: neg a1, a1
-; RV32IFD-NEXT: and a0, a1, a2
-; RV32IFD-NEXT: and a1, a1, a4
+; RV32IFD-NEXT: and a0, a1, a4
+; RV32IFD-NEXT: and a1, a1, a2
; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: .cfi_restore ra
; RV32IFD-NEXT: addi sp, sp, 32
@@ -1525,15 +1525,15 @@ define i64 @stest_f32i64(float %x) {
; RV32-NEXT: .LBB21_3: # %entry
; RV32-NEXT: slti a6, a4, 0
; RV32-NEXT: .LBB21_4: # %entry
-; RV32-NEXT: addi a7, a6, -1
-; RV32-NEXT: neg t0, a6
+; RV32-NEXT: neg a7, a6
+; RV32-NEXT: addi t0, a6, -1
; RV32-NEXT: bnez a6, .LBB21_6
; RV32-NEXT: # %bb.5: # %entry
; RV32-NEXT: mv a1, a5
; RV32-NEXT: .LBB21_6: # %entry
-; RV32-NEXT: or a3, a7, a3
-; RV32-NEXT: and a4, t0, a4
-; RV32-NEXT: and a2, t0, a2
+; RV32-NEXT: or a3, t0, a3
+; RV32-NEXT: and a4, a7, a4
+; RV32-NEXT: and a2, a7, a2
; RV32-NEXT: beq a1, a0, .LBB21_8
; RV32-NEXT: # %bb.7: # %entry
; RV32-NEXT: sltu a0, a0, a1
@@ -1658,8 +1658,8 @@ define i64 @ustest_f32i64(float %x) {
; RV32-NEXT: # %bb.4: # %entry
; RV32-NEXT: li a0, 1
; RV32-NEXT: .LBB23_5: # %entry
-; RV32-NEXT: lw a3, 8(sp)
-; RV32-NEXT: lw a4, 12(sp)
+; RV32-NEXT: lw a4, 8(sp)
+; RV32-NEXT: lw a3, 12(sp)
; RV32-NEXT: and a5, a2, a1
; RV32-NEXT: beqz a5, .LBB23_7
; RV32-NEXT: # %bb.6: # %entry
@@ -1673,12 +1673,12 @@ define i64 @ustest_f32i64(float %x) {
; RV32-NEXT: and a2, a2, a3
; RV32-NEXT: bnez a0, .LBB23_10
; RV32-NEXT: # %bb.9:
-; RV32-NEXT: or a0, a2, a4
+; RV32-NEXT: or a0, a4, a2
; RV32-NEXT: snez a1, a0
; RV32-NEXT: .LBB23_10: # %entry
; RV32-NEXT: neg a1, a1
-; RV32-NEXT: and a0, a1, a2
-; RV32-NEXT: and a1, a1, a4
+; RV32-NEXT: and a0, a1, a4
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
@@ -1752,15 +1752,15 @@ define i64 @stest_f16i64(half %x) {
; RV32-NEXT: .LBB24_3: # %entry
; RV32-NEXT: slti a6, a4, 0
; RV32-NEXT: .LBB24_4: # %entry
-; RV32-NEXT: addi a7, a6, -1
-; RV32-NEXT: neg t0, a6
+; RV32-NEXT: neg a7, a6
+; RV32-NEXT: addi t0, a6, -1
; RV32-NEXT: bnez a6, .LBB24_6
; RV32-NEXT: # %bb.5: # %entry
; RV32-NEXT: mv a1, a5
; RV32-NEXT: .LBB24_6: # %entry
-; RV32-NEXT: or a3, a7, a3
-; RV32-NEXT: and a4, t0, a4
-; RV32-NEXT: and a2, t0, a2
+; RV32-NEXT: or a3, t0, a3
+; RV32-NEXT: and a4, a7, a4
+; RV32-NEXT: and a2, a7, a2
; RV32-NEXT: beq a1, a0, .LBB24_8
; RV32-NEXT: # %bb.7: # %entry
; RV32-NEXT: sltu a0, a0, a1
@@ -1921,8 +1921,8 @@ define i64 @ustest_f16i64(half %x) {
; RV32-NEXT: # %bb.4: # %entry
; RV32-NEXT: li a0, 1
; RV32-NEXT: .LBB26_5: # %entry
-; RV32-NEXT: lw a3, 8(sp)
-; RV32-NEXT: lw a4, 12(sp)
+; RV32-NEXT: lw a4, 8(sp)
+; RV32-NEXT: lw a3, 12(sp)
; RV32-NEXT: and a5, a2, a1
; RV32-NEXT: beqz a5, .LBB26_7
; RV32-NEXT: # %bb.6: # %entry
@@ -1936,12 +1936,12 @@ define i64 @ustest_f16i64(half %x) {
; RV32-NEXT: and a2, a2, a3
; RV32-NEXT: bnez a0, .LBB26_10
; RV32-NEXT: # %bb.9:
-; RV32-NEXT: or a0, a2, a4
+; RV32-NEXT: or a0, a4, a2
; RV32-NEXT: snez a1, a0
; RV32-NEXT: .LBB26_10: # %entry
; RV32-NEXT: neg a1, a1
-; RV32-NEXT: and a0, a1, a2
-; RV32-NEXT: and a1, a1, a4
+; RV32-NEXT: and a0, a1, a4
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
@@ -3046,15 +3046,15 @@ define i64 @stest_f64i64_mm(double %x) {
; RV32IF-NEXT: .LBB45_3: # %entry
; RV32IF-NEXT: slti a6, a4, 0
; RV32IF-NEXT: .LBB45_4: # %entry
-; RV32IF-NEXT: addi a7, a6, -1
-; RV32IF-NEXT: neg t0, a6
+; RV32IF-NEXT: neg a7, a6
+; RV32IF-NEXT: addi t0, a6, -1
; RV32IF-NEXT: bnez a6, .LBB45_6
; RV32IF-NEXT: # %bb.5: # %entry
; RV32IF-NEXT: mv a1, a5
; RV32IF-NEXT: .LBB45_6: # %entry
-; RV32IF-NEXT: or a3, a7, a3
-; RV32IF-NEXT: and a4, t0, a4
-; RV32IF-NEXT: and a2, t0, a2
+; RV32IF-NEXT: or a3, t0, a3
+; RV32IF-NEXT: and a4, a7, a4
+; RV32IF-NEXT: and a2, a7, a2
; RV32IF-NEXT: beq a1, a0, .LBB45_8
; RV32IF-NEXT: # %bb.7: # %entry
; RV32IF-NEXT: sltu a0, a0, a1
@@ -3149,15 +3149,15 @@ define i64 @stest_f64i64_mm(double %x) {
; RV32IFD-NEXT: .LBB45_3: # %entry
; RV32IFD-NEXT: slti a6, a4, 0
; RV32IFD-NEXT: .LBB45_4: # %entry
-; RV32IFD-NEXT: addi a7, a6, -1
-; RV32IFD-NEXT: neg t0, a6
+; RV32IFD-NEXT: neg a7, a6
+; RV32IFD-NEXT: addi t0, a6, -1
; RV32IFD-NEXT: bnez a6, .LBB45_6
; RV32IFD-NEXT: # %bb.5: # %entry
; RV32IFD-NEXT: mv a1, a5
; RV32IFD-NEXT: .LBB45_6: # %entry
-; RV32IFD-NEXT: or a3, a7, a3
-; RV32IFD-NEXT: and a4, t0, a4
-; RV32IFD-NEXT: and a2, t0, a2
+; RV32IFD-NEXT: or a3, t0, a3
+; RV32IFD-NEXT: and a4, a7, a4
+; RV32IFD-NEXT: and a2, a7, a2
; RV32IFD-NEXT: beq a1, a0, .LBB45_8
; RV32IFD-NEXT: # %bb.7: # %entry
; RV32IFD-NEXT: sltu a0, a0, a1
@@ -3292,30 +3292,30 @@ define i64 @ustest_f64i64_mm(double %x) {
; RV32IF-NEXT: mv a1, a0
; RV32IF-NEXT: addi a0, sp, 8
; RV32IF-NEXT: call __fixdfti
-; RV32IF-NEXT: lw a0, 20(sp)
-; RV32IF-NEXT: lw a1, 8(sp)
-; RV32IF-NEXT: lw a2, 12(sp)
+; RV32IF-NEXT: lw a0, 8(sp)
+; RV32IF-NEXT: lw a1, 12(sp)
+; RV32IF-NEXT: lw a2, 20(sp)
; RV32IF-NEXT: lw a3, 16(sp)
-; RV32IF-NEXT: beqz a0, .LBB47_2
+; RV32IF-NEXT: beqz a2, .LBB47_2
; RV32IF-NEXT: # %bb.1: # %entry
-; RV32IF-NEXT: slti a4, a0, 0
+; RV32IF-NEXT: slti a4, a2, 0
; RV32IF-NEXT: j .LBB47_3
; RV32IF-NEXT: .LBB47_2:
; RV32IF-NEXT: seqz a4, a3
; RV32IF-NEXT: .LBB47_3: # %entry
; RV32IF-NEXT: xori a3, a3, 1
-; RV32IF-NEXT: or a3, a3, a0
+; RV32IF-NEXT: or a3, a3, a2
; RV32IF-NEXT: seqz a3, a3
; RV32IF-NEXT: addi a3, a3, -1
; RV32IF-NEXT: and a3, a3, a4
; RV32IF-NEXT: neg a3, a3
-; RV32IF-NEXT: and a2, a3, a2
; RV32IF-NEXT: and a1, a3, a1
; RV32IF-NEXT: and a0, a3, a0
-; RV32IF-NEXT: slti a0, a0, 0
-; RV32IF-NEXT: addi a3, a0, -1
-; RV32IF-NEXT: and a0, a3, a1
-; RV32IF-NEXT: and a1, a3, a2
+; RV32IF-NEXT: and a2, a3, a2
+; RV32IF-NEXT: slti a2, a2, 0
+; RV32IF-NEXT: addi a2, a2, -1
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: and a1, a2, a1
; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IF-NEXT: .cfi_restore ra
; RV32IF-NEXT: addi sp, sp, 32
@@ -3354,30 +3354,30 @@ define i64 @ustest_f64i64_mm(double %x) {
; RV32IFD-NEXT: .cfi_offset ra, -4
; RV32IFD-NEXT: addi a0, sp, 8
; RV32IFD-NEXT: call __fixdfti
-; RV32IFD-NEXT: lw a0, 20(sp)
-; RV32IFD-NEXT: lw a1, 8(sp)
-; RV32IFD-NEXT: lw a2, 12(sp)
+; RV32IFD-NEXT: lw a0, 8(sp)
+; RV32IFD-NEXT: lw a1, 12(sp)
+; RV32IFD-NEXT: lw a2, 20(sp)
; RV32IFD-NEXT: lw a3, 16(sp)
-; RV32IFD-NEXT: beqz a0, .LBB47_2
+; RV32IFD-NEXT: beqz a2, .LBB47_2
; RV32IFD-NEXT: # %bb.1: # %entry
-; RV32IFD-NEXT: slti a4, a0, 0
+; RV32IFD-NEXT: slti a4, a2, 0
; RV32IFD-NEXT: j .LBB47_3
; RV32IFD-NEXT: .LBB47_2:
; RV32IFD-NEXT: seqz a4, a3
; RV32IFD-NEXT: .LBB47_3: # %entry
; RV32IFD-NEXT: xori a3, a3, 1
-; RV32IFD-NEXT: or a3, a3, a0
+; RV32IFD-NEXT: or a3, a3, a2
; RV32IFD-NEXT: seqz a3, a3
; RV32IFD-NEXT: addi a3, a3, -1
; RV32IFD-NEXT: and a3, a3, a4
; RV32IFD-NEXT: neg a3, a3
-; RV32IFD-NEXT: and a2, a3, a2
; RV32IFD-NEXT: and a1, a3, a1
; RV32IFD-NEXT: and a0, a3, a0
-; RV32IFD-NEXT: slti a0, a0, 0
-; RV32IFD-NEXT: addi a3, a0, -1
-; RV32IFD-NEXT: and a0, a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
+; RV32IFD-NEXT: and a2, a3, a2
+; RV32IFD-NEXT: slti a2, a2, 0
+; RV32IFD-NEXT: addi a2, a2, -1
+; RV32IFD-NEXT: and a0, a2, a0
+; RV32IFD-NEXT: and a1, a2, a1
; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: .cfi_restore ra
; RV32IFD-NEXT: addi sp, sp, 32
@@ -3419,15 +3419,15 @@ define i64 @stest_f32i64_mm(float %x) {
; RV32-NEXT: .LBB48_3: # %entry
; RV32-NEXT: slti a6, a4, 0
; RV32-NEXT: .LBB48_4: # %entry
-; RV32-NEXT: addi a7, a6, -1
-; RV32-NEXT: neg t0, a6
+; RV32-NEXT: neg a7, a6
+; RV32-NEXT: addi t0, a6, -1
; RV32-NEXT: bnez a6, .LBB48_6
; RV32-NEXT: # %bb.5: # %entry
; RV32-NEXT: mv a1, a5
; RV32-NEXT: .LBB48_6: # %entry
-; RV32-NEXT: or a3, a7, a3
-; RV32-NEXT: and a4, t0, a4
-; RV32-NEXT: and a2, t0, a2
+; RV32-NEXT: or a3, t0, a3
+; RV32-NEXT: and a4, a7, a4
+; RV32-NEXT: and a2, a7, a2
; RV32-NEXT: beq a1, a0, .LBB48_8
; RV32-NEXT: # %bb.7: # %entry
; RV32-NEXT: sltu a0, a0, a1
@@ -3530,30 +3530,30 @@ define i64 @ustest_f32i64_mm(float %x) {
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: call __fixsfti
-; RV32-NEXT: lw a0, 20(sp)
-; RV32-NEXT: lw a1, 8(sp)
-; RV32-NEXT: lw a2, 12(sp)
+; RV32-NEXT: lw a0, 8(sp)
+; RV32-NEXT: lw a1, 12(sp)
+; RV32-NEXT: lw a2, 20(sp)
; RV32-NEXT: lw a3, 16(sp)
-; RV32-NEXT: beqz a0, .LBB50_2
+; RV32-NEXT: beqz a2, .LBB50_2
; RV32-NEXT: # %bb.1: # %entry
-; RV32-NEXT: slti a4, a0, 0
+; RV32-NEXT: slti a4, a2, 0
; RV32-NEXT: j .LBB50_3
; RV32-NEXT: .LBB50_2:
; RV32-NEXT: seqz a4, a3
; RV32-NEXT: .LBB50_3: # %entry
; RV32-NEXT: xori a3, a3, 1
-; RV32-NEXT: or a3, a3, a0
+; RV32-NEXT: or a3, a3, a2
; RV32-NEXT: seqz a3, a3
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a3, a3, a4
; RV32-NEXT: neg a3, a3
-; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: and a1, a3, a1
; RV32-NEXT: and a0, a3, a0
-; RV32-NEXT: slti a0, a0, 0
-; RV32-NEXT: addi a3, a0, -1
-; RV32-NEXT: and a0, a3, a1
-; RV32-NEXT: and a1, a3, a2
+; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: slti a2, a2, 0
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a0, a2, a0
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
@@ -3620,15 +3620,15 @@ define i64 @stest_f16i64_mm(half %x) {
; RV32-NEXT: .LBB51_3: # %entry
; RV32-NEXT: slti a6, a4, 0
; RV32-NEXT: .LBB51_4: # %entry
-; RV32-NEXT: addi a7, a6, -1
-; RV32-NEXT: neg t0, a6
+; RV32-NEXT: neg a7, a6
+; RV32-NEXT: addi t0, a6, -1
; RV32-NEXT: bnez a6, .LBB51_6
; RV32-NEXT: # %bb.5: # %entry
; RV32-NEXT: mv a1, a5
; RV32-NEXT: .LBB51_6: # %entry
-; RV32-NEXT: or a3, a7, a3
-; RV32-NEXT: and a4, t0, a4
-; RV32-NEXT: and a2, t0, a2
+; RV32-NEXT: or a3, t0, a3
+; RV32-NEXT: and a4, a7, a4
+; RV32-NEXT: and a2, a7, a2
; RV32-NEXT: beq a1, a0, .LBB51_8
; RV32-NEXT: # %bb.7: # %entry
; RV32-NEXT: sltu a0, a0, a1
@@ -3767,30 +3767,30 @@ define i64 @ustest_f16i64_mm(half %x) {
; RV32-NEXT: call __extendhfsf2
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: call __fixsfti
-; RV32-NEXT: lw a0, 20(sp)
-; RV32-NEXT: lw a1, 8(sp)
-; RV32-NEXT: lw a2, 12(sp)
+; RV32-NEXT: lw a0, 8(sp)
+; RV32-NEXT: lw a1, 12(sp)
+; RV32-NEXT: lw a2, 20(sp)
; RV32-NEXT: lw a3, 16(sp)
-; RV32-NEXT: beqz a0, .LBB53_2
+; RV32-NEXT: beqz a2, .LBB53_2
; RV32-NEXT: # %bb.1: # %entry
-; RV32-NEXT: slti a4, a0, 0
+; RV32-NEXT: slti a4, a2, 0
; RV32-NEXT: j .LBB53_3
; RV32-NEXT: .LBB53_2:
; RV32-NEXT: seqz a4, a3
; RV32-NEXT: .LBB53_3: # %entry
; RV32-NEXT: xori a3, a3, 1
-; RV32-NEXT: or a3, a3, a0
+; RV32-NEXT: or a3, a3, a2
; RV32-NEXT: seqz a3, a3
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a3, a3, a4
; RV32-NEXT: neg a3, a3
-; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: and a1, a3, a1
; RV32-NEXT: and a0, a3, a0
-; RV32-NEXT: slti a0, a0, 0
-; RV32-NEXT: addi a3, a0, -1
-; RV32-NEXT: and a0, a3, a1
-; RV32-NEXT: and a1, a3, a2
+; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: slti a2, a2, 0
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a0, a2, a0
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index 774f1a1608821..c157c63722cb4 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -301,58 +301,58 @@ define i64 @select_abs64(i64 %x) {
define i128 @abs128(i128 %x) {
; RV32I-LABEL: abs128:
; RV32I: # %bb.0:
-; RV32I-NEXT: lw a3, 12(a1)
-; RV32I-NEXT: lw a2, 0(a1)
+; RV32I-NEXT: lw a2, 12(a1)
+; RV32I-NEXT: lw a3, 0(a1)
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a1, 8(a1)
-; RV32I-NEXT: bgez a3, .LBB8_2
+; RV32I-NEXT: bgez a2, .LBB8_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: neg a5, a1
; RV32I-NEXT: snez a6, a4
-; RV32I-NEXT: snez a7, a2
+; RV32I-NEXT: snez a7, a3
; RV32I-NEXT: snez a1, a1
; RV32I-NEXT: neg a4, a4
; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: add a1, a3, a1
+; RV32I-NEXT: add a1, a2, a1
; RV32I-NEXT: sub a4, a4, a7
-; RV32I-NEXT: sltu a3, a5, a6
+; RV32I-NEXT: sltu a2, a5, a6
; RV32I-NEXT: neg a7, a1
; RV32I-NEXT: sub a1, a5, a6
-; RV32I-NEXT: sub a3, a7, a3
-; RV32I-NEXT: neg a2, a2
+; RV32I-NEXT: sub a2, a7, a2
+; RV32I-NEXT: neg a3, a3
; RV32I-NEXT: .LBB8_2:
-; RV32I-NEXT: sw a2, 0(a0)
+; RV32I-NEXT: sw a3, 0(a0)
; RV32I-NEXT: sw a4, 4(a0)
; RV32I-NEXT: sw a1, 8(a0)
-; RV32I-NEXT: sw a3, 12(a0)
+; RV32I-NEXT: sw a2, 12(a0)
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: abs128:
; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: lw a3, 12(a1)
-; RV32ZBB-NEXT: lw a2, 0(a1)
+; RV32ZBB-NEXT: lw a2, 12(a1)
+; RV32ZBB-NEXT: lw a3, 0(a1)
; RV32ZBB-NEXT: lw a4, 4(a1)
; RV32ZBB-NEXT: lw a1, 8(a1)
-; RV32ZBB-NEXT: bgez a3, .LBB8_2
+; RV32ZBB-NEXT: bgez a2, .LBB8_2
; RV32ZBB-NEXT: # %bb.1:
; RV32ZBB-NEXT: neg a5, a1
; RV32ZBB-NEXT: snez a6, a4
-; RV32ZBB-NEXT: snez a7, a2
+; RV32ZBB-NEXT: snez a7, a3
; RV32ZBB-NEXT: snez a1, a1
; RV32ZBB-NEXT: neg a4, a4
; RV32ZBB-NEXT: or a6, a7, a6
-; RV32ZBB-NEXT: add a1, a3, a1
+; RV32ZBB-NEXT: add a1, a2, a1
; RV32ZBB-NEXT: sub a4, a4, a7
-; RV32ZBB-NEXT: sltu a3, a5, a6
+; RV32ZBB-NEXT: sltu a2, a5, a6
; RV32ZBB-NEXT: neg a7, a1
; RV32ZBB-NEXT: sub a1, a5, a6
-; RV32ZBB-NEXT: sub a3, a7, a3
-; RV32ZBB-NEXT: neg a2, a2
+; RV32ZBB-NEXT: sub a2, a7, a2
+; RV32ZBB-NEXT: neg a3, a3
; RV32ZBB-NEXT: .LBB8_2:
-; RV32ZBB-NEXT: sw a2, 0(a0)
+; RV32ZBB-NEXT: sw a3, 0(a0)
; RV32ZBB-NEXT: sw a4, 4(a0)
; RV32ZBB-NEXT: sw a1, 8(a0)
-; RV32ZBB-NEXT: sw a3, 12(a0)
+; RV32ZBB-NEXT: sw a2, 12(a0)
; RV32ZBB-NEXT: ret
;
; RV64I-LABEL: abs128:
@@ -383,58 +383,58 @@ define i128 @abs128(i128 %x) {
define i128 @select_abs128(i128 %x) {
; RV32I-LABEL: select_abs128:
; RV32I: # %bb.0:
-; RV32I-NEXT: lw a3, 12(a1)
-; RV32I-NEXT: lw a2, 0(a1)
+; RV32I-NEXT: lw a2, 12(a1)
+; RV32I-NEXT: lw a3, 0(a1)
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a1, 8(a1)
-; RV32I-NEXT: bgez a3, .LBB9_2
+; RV32I-NEXT: bgez a2, .LBB9_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: neg a5, a1
; RV32I-NEXT: snez a6, a4
-; RV32I-NEXT: snez a7, a2
+; RV32I-NEXT: snez a7, a3
; RV32I-NEXT: snez a1, a1
; RV32I-NEXT: neg a4, a4
; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: add a1, a3, a1
+; RV32I-NEXT: add a1, a2, a1
; RV32I-NEXT: sub a4, a4, a7
-; RV32I-NEXT: sltu a3, a5, a6
+; RV32I-NEXT: sltu a2, a5, a6
; RV32I-NEXT: neg a7, a1
; RV32I-NEXT: sub a1, a5, a6
-; RV32I-NEXT: sub a3, a7, a3
-; RV32I-NEXT: neg a2, a2
+; RV32I-NEXT: sub a2, a7, a2
+; RV32I-NEXT: neg a3, a3
; RV32I-NEXT: .LBB9_2:
-; RV32I-NEXT: sw a2, 0(a0)
+; RV32I-NEXT: sw a3, 0(a0)
; RV32I-NEXT: sw a4, 4(a0)
; RV32I-NEXT: sw a1, 8(a0)
-; RV32I-NEXT: sw a3, 12(a0)
+; RV32I-NEXT: sw a2, 12(a0)
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: select_abs128:
; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: lw a3, 12(a1)
-; RV32ZBB-NEXT: lw a2, 0(a1)
+; RV32ZBB-NEXT: lw a2, 12(a1)
+; RV32ZBB-NEXT: lw a3, 0(a1)
; RV32ZBB-NEXT: lw a4, 4(a1)
; RV32ZBB-NEXT: lw a1, 8(a1)
-; RV32ZBB-NEXT: bgez a3, .LBB9_2
+; RV32ZBB-NEXT: bgez a2, .LBB9_2
; RV32ZBB-NEXT: # %bb.1:
; RV32ZBB-NEXT: neg a5, a1
; RV32ZBB-NEXT: snez a6, a4
-; RV32ZBB-NEXT: snez a7, a2
+; RV32ZBB-NEXT: snez a7, a3
; RV32ZBB-NEXT: snez a1, a1
; RV32ZBB-NEXT: neg a4, a4
; RV32ZBB-NEXT: or a6, a7, a6
-; RV32ZBB-NEXT: add a1, a3, a1
+; RV32ZBB-NEXT: add a1, a2, a1
; RV32ZBB-NEXT: sub a4, a4, a7
-; RV32ZBB-NEXT: sltu a3, a5, a6
+; RV32ZBB-NEXT: sltu a2, a5, a6
; RV32ZBB-NEXT: neg a7, a1
; RV32ZBB-NEXT: sub a1, a5, a6
-; RV32ZBB-NEXT: sub a3, a7, a3
-; RV32ZBB-NEXT: neg a2, a2
+; RV32ZBB-NEXT: sub a2, a7, a2
+; RV32ZBB-NEXT: neg a3, a3
; RV32ZBB-NEXT: .LBB9_2:
-; RV32ZBB-NEXT: sw a2, 0(a0)
+; RV32ZBB-NEXT: sw a3, 0(a0)
; RV32ZBB-NEXT: sw a4, 4(a0)
; RV32ZBB-NEXT: sw a1, 8(a0)
-; RV32ZBB-NEXT: sw a3, 12(a0)
+; RV32ZBB-NEXT: sw a2, 12(a0)
; RV32ZBB-NEXT: ret
;
; RV64I-LABEL: select_abs128:
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 8dd63015971d0..eb8b769b6d083 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -1587,59 +1587,59 @@ define i64 @sub_if_uge_i64(i64 %x, i64 %y) {
define i128 @sub_if_uge_i128(i128 %x, i128 %y) {
; CHECK-LABEL: sub_if_uge_i128:
; CHECK: # %bb.0:
-; CHECK-NEXT: lw a7, 4(a2)
-; CHECK-NEXT: lw a6, 8(a2)
-; CHECK-NEXT: lw t0, 12(a2)
; CHECK-NEXT: lw a3, 4(a1)
-; CHECK-NEXT: lw a4, 12(a1)
-; CHECK-NEXT: lw a5, 8(a1)
-; CHECK-NEXT: beq a4, t0, .LBB53_2
+; CHECK-NEXT: lw a4, 8(a1)
+; CHECK-NEXT: lw a5, 12(a1)
+; CHECK-NEXT: lw a6, 4(a2)
+; CHECK-NEXT: lw t0, 12(a2)
+; CHECK-NEXT: lw a7, 8(a2)
+; CHECK-NEXT: beq a5, t0, .LBB53_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: sltu t1, a4, t0
+; CHECK-NEXT: sltu t1, a5, t0
; CHECK-NEXT: j .LBB53_3
; CHECK-NEXT: .LBB53_2:
-; CHECK-NEXT: sltu t1, a5, a6
+; CHECK-NEXT: sltu t1, a4, a7
; CHECK-NEXT: .LBB53_3:
-; CHECK-NEXT: lw a2, 0(a2)
; CHECK-NEXT: lw a1, 0(a1)
-; CHECK-NEXT: beq a3, a7, .LBB53_5
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: beq a3, a6, .LBB53_5
; CHECK-NEXT: # %bb.4:
-; CHECK-NEXT: sltu t2, a3, a7
+; CHECK-NEXT: sltu t2, a3, a6
; CHECK-NEXT: j .LBB53_6
; CHECK-NEXT: .LBB53_5:
; CHECK-NEXT: sltu t2, a1, a2
; CHECK-NEXT: .LBB53_6:
-; CHECK-NEXT: xor t3, a4, t0
-; CHECK-NEXT: xor t4, a5, a6
+; CHECK-NEXT: xor t3, a5, t0
+; CHECK-NEXT: xor t4, a4, a7
; CHECK-NEXT: or t3, t4, t3
; CHECK-NEXT: beqz t3, .LBB53_8
; CHECK-NEXT: # %bb.7:
; CHECK-NEXT: mv t2, t1
; CHECK-NEXT: .LBB53_8:
-; CHECK-NEXT: addi t2, t2, -1
-; CHECK-NEXT: and t1, t2, t0
-; CHECK-NEXT: and t0, t2, a2
-; CHECK-NEXT: and a7, t2, a7
+; CHECK-NEXT: addi t3, t2, -1
+; CHECK-NEXT: and t2, t3, t0
+; CHECK-NEXT: and t0, t3, a2
+; CHECK-NEXT: and t1, t3, a6
; CHECK-NEXT: sltu a2, a1, t0
-; CHECK-NEXT: and t2, t2, a6
+; CHECK-NEXT: and a7, t3, a7
; CHECK-NEXT: mv a6, a2
-; CHECK-NEXT: beq a3, a7, .LBB53_10
+; CHECK-NEXT: beq a3, t1, .LBB53_10
; CHECK-NEXT: # %bb.9:
-; CHECK-NEXT: sltu a6, a3, a7
+; CHECK-NEXT: sltu a6, a3, t1
; CHECK-NEXT: .LBB53_10:
-; CHECK-NEXT: sub t3, a5, t2
-; CHECK-NEXT: sltu a5, a5, t2
-; CHECK-NEXT: sub a4, a4, t1
-; CHECK-NEXT: sub a3, a3, a7
+; CHECK-NEXT: sub t3, a4, a7
+; CHECK-NEXT: sltu a4, a4, a7
+; CHECK-NEXT: sub a5, a5, t2
+; CHECK-NEXT: sub a3, a3, t1
; CHECK-NEXT: sub a1, a1, t0
; CHECK-NEXT: sltu a7, t3, a6
-; CHECK-NEXT: sub a4, a4, a5
-; CHECK-NEXT: sub a5, t3, a6
+; CHECK-NEXT: sub a5, a5, a4
+; CHECK-NEXT: sub a4, t3, a6
; CHECK-NEXT: sub a3, a3, a2
-; CHECK-NEXT: sub a2, a4, a7
+; CHECK-NEXT: sub a2, a5, a7
; CHECK-NEXT: sw a1, 0(a0)
; CHECK-NEXT: sw a3, 4(a0)
-; CHECK-NEXT: sw a5, 8(a0)
+; CHECK-NEXT: sw a4, 8(a0)
; CHECK-NEXT: sw a2, 12(a0)
; CHECK-NEXT: ret
%cmp = icmp ult i128 %x, %y
diff --git a/llvm/test/CodeGen/RISCV/rv32zbs.ll b/llvm/test/CodeGen/RISCV/rv32zbs.ll
index 1a3beeb79b85b..e3728bffacf80 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbs.ll
@@ -785,16 +785,16 @@ define i32 @bset_trailing_ones_i32_no_mask(i32 %a) nounwind {
define i64 @bset_trailing_ones_i64_mask(i64 %a) nounwind {
; CHECK-LABEL: bset_trailing_ones_i64_mask:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a2, -1
-; CHECK-NEXT: andi a3, a0, 63
-; CHECK-NEXT: addi a1, a3, -32
-; CHECK-NEXT: sll a0, a2, a0
+; CHECK-NEXT: andi a2, a0, 63
+; CHECK-NEXT: li a3, -1
+; CHECK-NEXT: addi a1, a2, -32
+; CHECK-NEXT: sll a0, a3, a0
; CHECK-NEXT: bltz a1, .LBB43_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: sll a2, a2, a3
+; CHECK-NEXT: sll a2, a3, a2
; CHECK-NEXT: j .LBB43_3
; CHECK-NEXT: .LBB43_2:
-; CHECK-NEXT: not a2, a3
+; CHECK-NEXT: not a2, a2
; CHECK-NEXT: lui a3, 524288
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: srl a2, a3, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index f481f9cff5de1..9ef7f9441171c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -89,17 +89,17 @@ entry:
define <2 x i32> @ustest_f64i32(<2 x double> %x) {
; CHECK-NOV-LABEL: ustest_f64i32:
; CHECK-NOV: # %bb.0: # %entry
-; CHECK-NOV-NEXT: fcvt.l.d a1, fa1, rtz
+; CHECK-NOV-NEXT: fcvt.l.d a0, fa0, rtz
; CHECK-NOV-NEXT: li a2, -1
; CHECK-NOV-NEXT: srli a2, a2, 32
-; CHECK-NOV-NEXT: fcvt.l.d a0, fa0, rtz
-; CHECK-NOV-NEXT: blt a1, a2, .LBB2_2
+; CHECK-NOV-NEXT: fcvt.l.d a1, fa1, rtz
+; CHECK-NOV-NEXT: blt a0, a2, .LBB2_2
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: mv a1, a2
+; CHECK-NOV-NEXT: mv a0, a2
; CHECK-NOV-NEXT: .LBB2_2: # %entry
-; CHECK-NOV-NEXT: blt a0, a2, .LBB2_4
+; CHECK-NOV-NEXT: blt a1, a2, .LBB2_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
-; CHECK-NOV-NEXT: mv a0, a2
+; CHECK-NOV-NEXT: mv a1, a2
; CHECK-NOV-NEXT: .LBB2_4: # %entry
; CHECK-NOV-NEXT: sgtz a2, a1
; CHECK-NOV-NEXT: sgtz a3, a0
@@ -254,50 +254,50 @@ entry:
define <4 x i32> @ustest_f32i32(<4 x float> %x) {
; CHECK-NOV-LABEL: ustest_f32i32:
; CHECK-NOV: # %bb.0: # %entry
-; CHECK-NOV-NEXT: fcvt.l.s a1, fa3, rtz
-; CHECK-NOV-NEXT: li a4, -1
-; CHECK-NOV-NEXT: srli a4, a4, 32
-; CHECK-NOV-NEXT: fcvt.l.s a2, fa2, rtz
-; CHECK-NOV-NEXT: bge a1, a4, .LBB5_6
+; CHECK-NOV-NEXT: fcvt.l.s a1, fa0, rtz
+; CHECK-NOV-NEXT: li a5, -1
+; CHECK-NOV-NEXT: srli a5, a5, 32
+; CHECK-NOV-NEXT: fcvt.l.s a2, fa1, rtz
+; CHECK-NOV-NEXT: bge a1, a5, .LBB5_6
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: fcvt.l.s a3, fa1, rtz
-; CHECK-NOV-NEXT: bge a2, a4, .LBB5_7
+; CHECK-NOV-NEXT: fcvt.l.s a3, fa2, rtz
+; CHECK-NOV-NEXT: bge a2, a5, .LBB5_7
; CHECK-NOV-NEXT: .LBB5_2: # %entry
-; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz
-; CHECK-NOV-NEXT: bge a3, a4, .LBB5_8
+; CHECK-NOV-NEXT: fcvt.l.s a4, fa3, rtz
+; CHECK-NOV-NEXT: bge a3, a5, .LBB5_8
; CHECK-NOV-NEXT: .LBB5_3: # %entry
-; CHECK-NOV-NEXT: blt a5, a4, .LBB5_5
+; CHECK-NOV-NEXT: blt a4, a5, .LBB5_5
; CHECK-NOV-NEXT: .LBB5_4: # %entry
-; CHECK-NOV-NEXT: mv a5, a4
+; CHECK-NOV-NEXT: mv a4, a5
; CHECK-NOV-NEXT: .LBB5_5: # %entry
-; CHECK-NOV-NEXT: sgtz a4, a1
-; CHECK-NOV-NEXT: sgtz a6, a2
-; CHECK-NOV-NEXT: sgtz a7, a3
-; CHECK-NOV-NEXT: sgtz t0, a5
+; CHECK-NOV-NEXT: sgtz a5, a4
+; CHECK-NOV-NEXT: sgtz a6, a3
+; CHECK-NOV-NEXT: sgtz a7, a2
+; CHECK-NOV-NEXT: sgtz t0, a1
; CHECK-NOV-NEXT: neg t0, t0
; CHECK-NOV-NEXT: neg a7, a7
; CHECK-NOV-NEXT: neg a6, a6
-; CHECK-NOV-NEXT: neg a4, a4
-; CHECK-NOV-NEXT: and a5, t0, a5
-; CHECK-NOV-NEXT: and a3, a7, a3
-; CHECK-NOV-NEXT: and a2, a6, a2
-; CHECK-NOV-NEXT: and a1, a4, a1
-; CHECK-NOV-NEXT: sw a5, 0(a0)
-; CHECK-NOV-NEXT: sw a3, 4(a0)
-; CHECK-NOV-NEXT: sw a2, 8(a0)
-; CHECK-NOV-NEXT: sw a1, 12(a0)
+; CHECK-NOV-NEXT: neg a5, a5
+; CHECK-NOV-NEXT: and a1, t0, a1
+; CHECK-NOV-NEXT: and a2, a7, a2
+; CHECK-NOV-NEXT: and a3, a6, a3
+; CHECK-NOV-NEXT: and a4, a5, a4
+; CHECK-NOV-NEXT: sw a1, 0(a0)
+; CHECK-NOV-NEXT: sw a2, 4(a0)
+; CHECK-NOV-NEXT: sw a3, 8(a0)
+; CHECK-NOV-NEXT: sw a4, 12(a0)
; CHECK-NOV-NEXT: ret
; CHECK-NOV-NEXT: .LBB5_6: # %entry
-; CHECK-NOV-NEXT: mv a1, a4
-; CHECK-NOV-NEXT: fcvt.l.s a3, fa1, rtz
-; CHECK-NOV-NEXT: blt a2, a4, .LBB5_2
+; CHECK-NOV-NEXT: mv a1, a5
+; CHECK-NOV-NEXT: fcvt.l.s a3, fa2, rtz
+; CHECK-NOV-NEXT: blt a2, a5, .LBB5_2
; CHECK-NOV-NEXT: .LBB5_7: # %entry
-; CHECK-NOV-NEXT: mv a2, a4
-; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz
-; CHECK-NOV-NEXT: blt a3, a4, .LBB5_3
+; CHECK-NOV-NEXT: mv a2, a5
+; CHECK-NOV-NEXT: fcvt.l.s a4, fa3, rtz
+; CHECK-NOV-NEXT: blt a3, a5, .LBB5_3
; CHECK-NOV-NEXT: .LBB5_8: # %entry
-; CHECK-NOV-NEXT: mv a3, a4
-; CHECK-NOV-NEXT: bge a5, a4, .LBB5_4
+; CHECK-NOV-NEXT: mv a3, a5
+; CHECK-NOV-NEXT: bge a4, a5, .LBB5_4
; CHECK-NOV-NEXT: j .LBB5_5
;
; CHECK-V-LABEL: ustest_f32i32:
@@ -720,8 +720,8 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NOV-NEXT: .cfi_offset fs2, -64
; CHECK-NOV-NEXT: .cfi_remember_state
; CHECK-NOV-NEXT: lhu s1, 0(a1)
-; CHECK-NOV-NEXT: lhu s2, 8(a1)
-; CHECK-NOV-NEXT: lhu a2, 16(a1)
+; CHECK-NOV-NEXT: lhu a2, 8(a1)
+; CHECK-NOV-NEXT: lhu s2, 16(a1)
; CHECK-NOV-NEXT: lhu s3, 24(a1)
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: fmv.w.x fa0, a2
@@ -730,43 +730,43 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NOV-NEXT: fmv.w.x fa0, s2
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs1, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s1
+; CHECK-NOV-NEXT: fmv.w.x fa0, s3
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs0, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s3
+; CHECK-NOV-NEXT: fmv.w.x fa0, s1
; CHECK-NOV-NEXT: fcvt.l.s s1, fs2, rtz
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-NOV-NEXT: li a2, -1
-; CHECK-NOV-NEXT: srli a2, a2, 32
-; CHECK-NOV-NEXT: bge a0, a2, .LBB8_6
+; CHECK-NOV-NEXT: li a3, -1
+; CHECK-NOV-NEXT: srli a3, a3, 32
+; CHECK-NOV-NEXT: bge a0, a3, .LBB8_6
; CHECK-NOV-NEXT: # %bb.1: # %entry
; CHECK-NOV-NEXT: fcvt.l.s a1, fs1, rtz
-; CHECK-NOV-NEXT: bge s1, a2, .LBB8_7
+; CHECK-NOV-NEXT: bge s1, a3, .LBB8_7
; CHECK-NOV-NEXT: .LBB8_2: # %entry
-; CHECK-NOV-NEXT: fcvt.l.s a3, fs0, rtz
-; CHECK-NOV-NEXT: bge a1, a2, .LBB8_8
+; CHECK-NOV-NEXT: fcvt.l.s a2, fs0, rtz
+; CHECK-NOV-NEXT: bge a1, a3, .LBB8_8
; CHECK-NOV-NEXT: .LBB8_3: # %entry
-; CHECK-NOV-NEXT: blt a3, a2, .LBB8_5
+; CHECK-NOV-NEXT: blt a2, a3, .LBB8_5
; CHECK-NOV-NEXT: .LBB8_4: # %entry
-; CHECK-NOV-NEXT: mv a3, a2
+; CHECK-NOV-NEXT: mv a2, a3
; CHECK-NOV-NEXT: .LBB8_5: # %entry
-; CHECK-NOV-NEXT: sgtz a2, a0
-; CHECK-NOV-NEXT: sgtz a4, s1
-; CHECK-NOV-NEXT: sgtz a5, a1
-; CHECK-NOV-NEXT: sgtz a6, a3
+; CHECK-NOV-NEXT: sgtz a3, a2
+; CHECK-NOV-NEXT: sgtz a4, a1
+; CHECK-NOV-NEXT: sgtz a5, s1
+; CHECK-NOV-NEXT: sgtz a6, a0
; CHECK-NOV-NEXT: neg a6, a6
; CHECK-NOV-NEXT: neg a5, a5
; CHECK-NOV-NEXT: neg a4, a4
-; CHECK-NOV-NEXT: neg a2, a2
-; CHECK-NOV-NEXT: and a3, a6, a3
-; CHECK-NOV-NEXT: and a1, a5, a1
-; CHECK-NOV-NEXT: and a4, a4, s1
-; CHECK-NOV-NEXT: and a0, a2, a0
-; CHECK-NOV-NEXT: sw a3, 0(s0)
-; CHECK-NOV-NEXT: sw a1, 4(s0)
-; CHECK-NOV-NEXT: sw a4, 8(s0)
-; CHECK-NOV-NEXT: sw a0, 12(s0)
+; CHECK-NOV-NEXT: neg a3, a3
+; CHECK-NOV-NEXT: and a0, a6, a0
+; CHECK-NOV-NEXT: and a5, a5, s1
+; CHECK-NOV-NEXT: and a1, a4, a1
+; CHECK-NOV-NEXT: and a2, a3, a2
+; CHECK-NOV-NEXT: sw a0, 0(s0)
+; CHECK-NOV-NEXT: sw a5, 4(s0)
+; CHECK-NOV-NEXT: sw a1, 8(s0)
+; CHECK-NOV-NEXT: sw a2, 12(s0)
; CHECK-NOV-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
@@ -788,16 +788,16 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NOV-NEXT: ret
; CHECK-NOV-NEXT: .LBB8_6: # %entry
; CHECK-NOV-NEXT: .cfi_restore_state
-; CHECK-NOV-NEXT: mv a0, a2
+; CHECK-NOV-NEXT: mv a0, a3
; CHECK-NOV-NEXT: fcvt.l.s a1, fs1, rtz
-; CHECK-NOV-NEXT: blt s1, a2, .LBB8_2
+; CHECK-NOV-NEXT: blt s1, a3, .LBB8_2
; CHECK-NOV-NEXT: .LBB8_7: # %entry
-; CHECK-NOV-NEXT: mv s1, a2
-; CHECK-NOV-NEXT: fcvt.l.s a3, fs0, rtz
-; CHECK-NOV-NEXT: blt a1, a2, .LBB8_3
+; CHECK-NOV-NEXT: mv s1, a3
+; CHECK-NOV-NEXT: fcvt.l.s a2, fs0, rtz
+; CHECK-NOV-NEXT: blt a1, a3, .LBB8_3
; CHECK-NOV-NEXT: .LBB8_8: # %entry
-; CHECK-NOV-NEXT: mv a1, a2
-; CHECK-NOV-NEXT: bge a3, a2, .LBB8_4
+; CHECK-NOV-NEXT: mv a1, a3
+; CHECK-NOV-NEXT: bge a2, a3, .LBB8_4
; CHECK-NOV-NEXT: j .LBB8_5
;
; CHECK-V-LABEL: ustest_f16i32:
@@ -977,17 +977,17 @@ entry:
define <2 x i16> @ustest_f64i16(<2 x double> %x) {
; CHECK-NOV-LABEL: ustest_f64i16:
; CHECK-NOV: # %bb.0: # %entry
-; CHECK-NOV-NEXT: fcvt.w.d a1, fa1, rtz
+; CHECK-NOV-NEXT: fcvt.w.d a0, fa0, rtz
; CHECK-NOV-NEXT: lui a2, 16
; CHECK-NOV-NEXT: addi a2, a2, -1
-; CHECK-NOV-NEXT: fcvt.w.d a0, fa0, rtz
-; CHECK-NOV-NEXT: blt a1, a2, .LBB11_2
+; CHECK-NOV-NEXT: fcvt.w.d a1, fa1, rtz
+; CHECK-NOV-NEXT: blt a0, a2, .LBB11_2
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: mv a1, a2
+; CHECK-NOV-NEXT: mv a0, a2
; CHECK-NOV-NEXT: .LBB11_2: # %entry
-; CHECK-NOV-NEXT: blt a0, a2, .LBB11_4
+; CHECK-NOV-NEXT: blt a1, a2, .LBB11_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
-; CHECK-NOV-NEXT: mv a0, a2
+; CHECK-NOV-NEXT: mv a1, a2
; CHECK-NOV-NEXT: .LBB11_4: # %entry
; CHECK-NOV-NEXT: sgtz a2, a1
; CHECK-NOV-NEXT: sgtz a3, a0
@@ -1146,50 +1146,50 @@ entry:
define <4 x i16> @ustest_f32i16(<4 x float> %x) {
; CHECK-NOV-LABEL: ustest_f32i16:
; CHECK-NOV: # %bb.0: # %entry
-; CHECK-NOV-NEXT: fcvt.w.s a1, fa3, rtz
-; CHECK-NOV-NEXT: lui a4, 16
-; CHECK-NOV-NEXT: addi a4, a4, -1
-; CHECK-NOV-NEXT: fcvt.w.s a2, fa2, rtz
-; CHECK-NOV-NEXT: bge a1, a4, .LBB14_6
+; CHECK-NOV-NEXT: fcvt.w.s a1, fa0, rtz
+; CHECK-NOV-NEXT: lui a5, 16
+; CHECK-NOV-NEXT: addi a5, a5, -1
+; CHECK-NOV-NEXT: fcvt.w.s a2, fa1, rtz
+; CHECK-NOV-NEXT: bge a1, a5, .LBB14_6
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz
-; CHECK-NOV-NEXT: bge a2, a4, .LBB14_7
+; CHECK-NOV-NEXT: fcvt.w.s a3, fa2, rtz
+; CHECK-NOV-NEXT: bge a2, a5, .LBB14_7
; CHECK-NOV-NEXT: .LBB14_2: # %entry
-; CHECK-NOV-NEXT: fcvt.w.s a5, fa0, rtz
-; CHECK-NOV-NEXT: bge a3, a4, .LBB14_8
+; CHECK-NOV-NEXT: fcvt.w.s a4, fa3, rtz
+; CHECK-NOV-NEXT: bge a3, a5, .LBB14_8
; CHECK-NOV-NEXT: .LBB14_3: # %entry
-; CHECK-NOV-NEXT: blt a5, a4, .LBB14_5
+; CHECK-NOV-NEXT: blt a4, a5, .LBB14_5
; CHECK-NOV-NEXT: .LBB14_4: # %entry
-; CHECK-NOV-NEXT: mv a5, a4
+; CHECK-NOV-NEXT: mv a4, a5
; CHECK-NOV-NEXT: .LBB14_5: # %entry
-; CHECK-NOV-NEXT: sgtz a4, a1
-; CHECK-NOV-NEXT: sgtz a6, a2
-; CHECK-NOV-NEXT: sgtz a7, a3
-; CHECK-NOV-NEXT: sgtz t0, a5
+; CHECK-NOV-NEXT: sgtz a5, a4
+; CHECK-NOV-NEXT: sgtz a6, a3
+; CHECK-NOV-NEXT: sgtz a7, a2
+; CHECK-NOV-NEXT: sgtz t0, a1
; CHECK-NOV-NEXT: neg t0, t0
; CHECK-NOV-NEXT: neg a7, a7
; CHECK-NOV-NEXT: neg a6, a6
-; CHECK-NOV-NEXT: neg a4, a4
-; CHECK-NOV-NEXT: and a5, t0, a5
-; CHECK-NOV-NEXT: and a3, a7, a3
-; CHECK-NOV-NEXT: and a2, a6, a2
-; CHECK-NOV-NEXT: and a1, a4, a1
-; CHECK-NOV-NEXT: sh a5, 0(a0)
-; CHECK-NOV-NEXT: sh a3, 2(a0)
-; CHECK-NOV-NEXT: sh a2, 4(a0)
-; CHECK-NOV-NEXT: sh a1, 6(a0)
+; CHECK-NOV-NEXT: neg a5, a5
+; CHECK-NOV-NEXT: and a1, t0, a1
+; CHECK-NOV-NEXT: and a2, a7, a2
+; CHECK-NOV-NEXT: and a3, a6, a3
+; CHECK-NOV-NEXT: and a4, a5, a4
+; CHECK-NOV-NEXT: sh a1, 0(a0)
+; CHECK-NOV-NEXT: sh a2, 2(a0)
+; CHECK-NOV-NEXT: sh a3, 4(a0)
+; CHECK-NOV-NEXT: sh a4, 6(a0)
; CHECK-NOV-NEXT: ret
; CHECK-NOV-NEXT: .LBB14_6: # %entry
-; CHECK-NOV-NEXT: mv a1, a4
-; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz
-; CHECK-NOV-NEXT: blt a2, a4, .LBB14_2
+; CHECK-NOV-NEXT: mv a1, a5
+; CHECK-NOV-NEXT: fcvt.w.s a3, fa2, rtz
+; CHECK-NOV-NEXT: blt a2, a5, .LBB14_2
; CHECK-NOV-NEXT: .LBB14_7: # %entry
-; CHECK-NOV-NEXT: mv a2, a4
-; CHECK-NOV-NEXT: fcvt.w.s a5, fa0, rtz
-; CHECK-NOV-NEXT: blt a3, a4, .LBB14_3
+; CHECK-NOV-NEXT: mv a2, a5
+; CHECK-NOV-NEXT: fcvt.w.s a4, fa3, rtz
+; CHECK-NOV-NEXT: blt a3, a5, .LBB14_3
; CHECK-NOV-NEXT: .LBB14_8: # %entry
-; CHECK-NOV-NEXT: mv a3, a4
-; CHECK-NOV-NEXT: bge a5, a4, .LBB14_4
+; CHECK-NOV-NEXT: mv a3, a5
+; CHECK-NOV-NEXT: bge a4, a5, .LBB14_4
; CHECK-NOV-NEXT: j .LBB14_5
;
; CHECK-V-LABEL: ustest_f32i16:
@@ -1974,72 +1974,72 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
; CHECK-NOV-NEXT: .cfi_remember_state
; CHECK-NOV-NEXT: lhu s1, 32(a1)
; CHECK-NOV-NEXT: lhu s2, 40(a1)
-; CHECK-NOV-NEXT: lhu a2, 48(a1)
-; CHECK-NOV-NEXT: lhu s3, 56(a1)
-; CHECK-NOV-NEXT: lhu s4, 0(a1)
-; CHECK-NOV-NEXT: lhu s5, 8(a1)
+; CHECK-NOV-NEXT: lhu s3, 48(a1)
+; CHECK-NOV-NEXT: lhu s4, 56(a1)
+; CHECK-NOV-NEXT: lhu s5, 0(a1)
+; CHECK-NOV-NEXT: lhu a2, 8(a1)
; CHECK-NOV-NEXT: lhu s6, 16(a1)
; CHECK-NOV-NEXT: lhu s7, 24(a1)
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: fmv.w.x fa0, a2
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs6, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s2
+; CHECK-NOV-NEXT: fmv.w.x fa0, s6
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs5, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s1
+; CHECK-NOV-NEXT: fmv.w.x fa0, s7
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs4, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s7
+; CHECK-NOV-NEXT: fmv.w.x fa0, s1
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs3, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s6
+; CHECK-NOV-NEXT: fmv.w.x fa0, s2
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs2, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s5
+; CHECK-NOV-NEXT: fmv.w.x fa0, s3
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs1, fa0
; CHECK-NOV-NEXT: fmv.w.x fa0, s4
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs0, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s3
+; CHECK-NOV-NEXT: fmv.w.x fa0, s5
; CHECK-NOV-NEXT: fcvt.l.s s1, fs6, rtz
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-NOV-NEXT: lui a4, 16
-; CHECK-NOV-NEXT: addi a4, a4, -1
-; CHECK-NOV-NEXT: bge a0, a4, .LBB17_10
+; CHECK-NOV-NEXT: lui a5, 16
+; CHECK-NOV-NEXT: addi a5, a5, -1
+; CHECK-NOV-NEXT: bge a0, a5, .LBB17_10
; CHECK-NOV-NEXT: # %bb.1: # %entry
; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz
-; CHECK-NOV-NEXT: bge s1, a4, .LBB17_11
+; CHECK-NOV-NEXT: bge s1, a5, .LBB17_11
; CHECK-NOV-NEXT: .LBB17_2: # %entry
; CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz
-; CHECK-NOV-NEXT: bge a1, a4, .LBB17_12
+; CHECK-NOV-NEXT: bge a1, a5, .LBB17_12
; CHECK-NOV-NEXT: .LBB17_3: # %entry
; CHECK-NOV-NEXT: fcvt.l.s a3, fs3, rtz
-; CHECK-NOV-NEXT: bge a2, a4, .LBB17_13
+; CHECK-NOV-NEXT: bge a2, a5, .LBB17_13
; CHECK-NOV-NEXT: .LBB17_4: # %entry
-; CHECK-NOV-NEXT: fcvt.l.s a5, fs2, rtz
-; CHECK-NOV-NEXT: bge a3, a4, .LBB17_14
+; CHECK-NOV-NEXT: fcvt.l.s a4, fs2, rtz
+; CHECK-NOV-NEXT: bge a3, a5, .LBB17_14
; CHECK-NOV-NEXT: .LBB17_5: # %entry
; CHECK-NOV-NEXT: fcvt.l.s a6, fs1, rtz
-; CHECK-NOV-NEXT: bge a5, a4, .LBB17_15
+; CHECK-NOV-NEXT: bge a4, a5, .LBB17_15
; CHECK-NOV-NEXT: .LBB17_6: # %entry
; CHECK-NOV-NEXT: fcvt.l.s a7, fs0, rtz
-; CHECK-NOV-NEXT: bge a6, a4, .LBB17_16
+; CHECK-NOV-NEXT: bge a6, a5, .LBB17_16
; CHECK-NOV-NEXT: .LBB17_7: # %entry
-; CHECK-NOV-NEXT: blt a7, a4, .LBB17_9
+; CHECK-NOV-NEXT: blt a7, a5, .LBB17_9
; CHECK-NOV-NEXT: .LBB17_8: # %entry
-; CHECK-NOV-NEXT: mv a7, a4
+; CHECK-NOV-NEXT: mv a7, a5
; CHECK-NOV-NEXT: .LBB17_9: # %entry
-; CHECK-NOV-NEXT: sgtz a4, a0
-; CHECK-NOV-NEXT: sgtz t0, s1
-; CHECK-NOV-NEXT: sgtz t1, a1
-; CHECK-NOV-NEXT: sgtz t2, a2
-; CHECK-NOV-NEXT: sgtz t3, a3
-; CHECK-NOV-NEXT: sgtz t4, a5
-; CHECK-NOV-NEXT: sgtz t5, a6
-; CHECK-NOV-NEXT: sgtz t6, a7
+; CHECK-NOV-NEXT: sgtz a5, a7
+; CHECK-NOV-NEXT: sgtz t0, a6
+; CHECK-NOV-NEXT: sgtz t1, a4
+; CHECK-NOV-NEXT: sgtz t2, a3
+; CHECK-NOV-NEXT: sgtz t3, a2
+; CHECK-NOV-NEXT: sgtz t4, a1
+; CHECK-NOV-NEXT: sgtz t5, s1
+; CHECK-NOV-NEXT: sgtz t6, a0
; CHECK-NOV-NEXT: neg t6, t6
; CHECK-NOV-NEXT: neg t5, t5
; CHECK-NOV-NEXT: neg t4, t4
@@ -2047,23 +2047,23 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
; CHECK-NOV-NEXT: neg t2, t2
; CHECK-NOV-NEXT: neg t1, t1
; CHECK-NOV-NEXT: neg t0, t0
-; CHECK-NOV-NEXT: neg a4, a4
-; CHECK-NOV-NEXT: and a7, t6, a7
-; CHECK-NOV-NEXT: and a6, t5, a6
-; CHECK-NOV-NEXT: and a5, t4, a5
-; CHECK-NOV-NEXT: and a3, t3, a3
-; CHECK-NOV-NEXT: and a2, t2, a2
-; CHECK-NOV-NEXT: and a1, t1, a1
-; CHECK-NOV-NEXT: and t0, t0, s1
-; CHECK-NOV-NEXT: and a0, a4, a0
-; CHECK-NOV-NEXT: sh a2, 8(s0)
-; CHECK-NOV-NEXT: sh a1, 10(s0)
-; CHECK-NOV-NEXT: sh t0, 12(s0)
-; CHECK-NOV-NEXT: sh a0, 14(s0)
-; CHECK-NOV-NEXT: sh a7, 0(s0)
-; CHECK-NOV-NEXT: sh a6, 2(s0)
-; CHECK-NOV-NEXT: sh a5, 4(s0)
-; CHECK-NOV-NEXT: sh a3, 6(s0)
+; CHECK-NOV-NEXT: neg a5, a5
+; CHECK-NOV-NEXT: and a0, t6, a0
+; CHECK-NOV-NEXT: and t5, t5, s1
+; CHECK-NOV-NEXT: and a1, t4, a1
+; CHECK-NOV-NEXT: and a2, t3, a2
+; CHECK-NOV-NEXT: and a3, t2, a3
+; CHECK-NOV-NEXT: and a4, t1, a4
+; CHECK-NOV-NEXT: and a6, t0, a6
+; CHECK-NOV-NEXT: and a5, a5, a7
+; CHECK-NOV-NEXT: sh a3, 8(s0)
+; CHECK-NOV-NEXT: sh a4, 10(s0)
+; CHECK-NOV-NEXT: sh a6, 12(s0)
+; CHECK-NOV-NEXT: sh a5, 14(s0)
+; CHECK-NOV-NEXT: sh a0, 0(s0)
+; CHECK-NOV-NEXT: sh t5, 2(s0)
+; CHECK-NOV-NEXT: sh a1, 4(s0)
+; CHECK-NOV-NEXT: sh a2, 6(s0)
; CHECK-NOV-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
@@ -2101,32 +2101,32 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
; CHECK-NOV-NEXT: ret
; CHECK-NOV-NEXT: .LBB17_10: # %entry
; CHECK-NOV-NEXT: .cfi_restore_state
-; CHECK-NOV-NEXT: mv a0, a4
+; CHECK-NOV-NEXT: mv a0, a5
; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz
-; CHECK-NOV-NEXT: blt s1, a4, .LBB17_2
+; CHECK-NOV-NEXT: blt s1, a5, .LBB17_2
; CHECK-NOV-NEXT: .LBB17_11: # %entry
-; CHECK-NOV-NEXT: mv s1, a4
+; CHECK-NOV-NEXT: mv s1, a5
; CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz
-; CHECK-NOV-NEXT: blt a1, a4, .LBB17_3
+; CHECK-NOV-NEXT: blt a1, a5, .LBB17_3
; CHECK-NOV-NEXT: .LBB17_12: # %entry
-; CHECK-NOV-NEXT: mv a1, a4
+; CHECK-NOV-NEXT: mv a1, a5
; CHECK-NOV-NEXT: fcvt.l.s a3, fs3, rtz
-; CHECK-NOV-NEXT: blt a2, a4, .LBB17_4
+; CHECK-NOV-NEXT: blt a2, a5, .LBB17_4
; CHECK-NOV-NEXT: .LBB17_13: # %entry
-; CHECK-NOV-NEXT: mv a2, a4
-; CHECK-NOV-NEXT: fcvt.l.s a5, fs2, rtz
-; CHECK-NOV-NEXT: blt a3, a4, .LBB17_5
+; CHECK-NOV-NEXT: mv a2, a5
+; CHECK-NOV-NEXT: fcvt.l.s a4, fs2, rtz
+; CHECK-NOV-NEXT: blt a3, a5, .LBB17_5
; CHECK-NOV-NEXT: .LBB17_14: # %entry
-; CHECK-NOV-NEXT: mv a3, a4
+; CHECK-NOV-NEXT: mv a3, a5
; CHECK-NOV-NEXT: fcvt.l.s a6, fs1, rtz
-; CHECK-NOV-NEXT: blt a5, a4, .LBB17_6
+; CHECK-NOV-NEXT: blt a4, a5, .LBB17_6
; CHECK-NOV-NEXT: .LBB17_15: # %entry
-; CHECK-NOV-NEXT: mv a5, a4
+; CHECK-NOV-NEXT: mv a4, a5
; CHECK-NOV-NEXT: fcvt.l.s a7, fs0, rtz
-; CHECK-NOV-NEXT: blt a6, a4, .LBB17_7
+; CHECK-NOV-NEXT: blt a6, a5, .LBB17_7
; CHECK-NOV-NEXT: .LBB17_16: # %entry
-; CHECK-NOV-NEXT: mv a6, a4
-; CHECK-NOV-NEXT: bge a7, a4, .LBB17_8
+; CHECK-NOV-NEXT: mv a6, a5
+; CHECK-NOV-NEXT: bge a7, a5, .LBB17_8
; CHECK-NOV-NEXT: j .LBB17_9
;
; CHECK-V-LABEL: ustest_f16i16:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
index d0b184bd853ee..afe918bd66648 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
@@ -13,22 +13,22 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) {
; RV32: # %bb.0:
; RV32-NEXT: lw a0, 0(a0)
; RV32-NEXT: srli a2, a0, 16
-; RV32-NEXT: slli a3, a0, 16
-; RV32-NEXT: slli a4, a0, 24
-; RV32-NEXT: slli a5, a0, 8
-; RV32-NEXT: srli a6, a3, 24
-; RV32-NEXT: srai a3, a3, 24
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: slli a4, a0, 16
+; RV32-NEXT: slli a5, a0, 24
+; RV32-NEXT: slli a6, a0, 8
; RV32-NEXT: srai a4, a4, 24
; RV32-NEXT: srai a5, a5, 24
+; RV32-NEXT: srai a6, a6, 24
+; RV32-NEXT: sgtz a6, a6
; RV32-NEXT: sgtz a5, a5
; RV32-NEXT: sgtz a4, a4
-; RV32-NEXT: sgtz a3, a3
-; RV32-NEXT: neg a3, a3
; RV32-NEXT: neg a4, a4
; RV32-NEXT: neg a5, a5
-; RV32-NEXT: and a3, a3, a6
-; RV32-NEXT: and a0, a4, a0
-; RV32-NEXT: and a2, a5, a2
+; RV32-NEXT: neg a6, a6
+; RV32-NEXT: and a3, a4, a3
+; RV32-NEXT: and a0, a5, a0
+; RV32-NEXT: and a2, a6, a2
; RV32-NEXT: slli a3, a3, 8
; RV32-NEXT: zext.b a0, a0
; RV32-NEXT: or a0, a0, a3
@@ -39,23 +39,23 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) {
; RV64-LABEL: vec3_setcc_crash:
; RV64: # %bb.0:
; RV64-NEXT: lw a0, 0(a0)
-; RV64-NEXT: srliw a2, a0, 16
-; RV64-NEXT: slli a3, a0, 48
-; RV64-NEXT: slli a4, a0, 56
-; RV64-NEXT: slli a5, a0, 40
-; RV64-NEXT: srli a6, a3, 56
-; RV64-NEXT: srai a3, a3, 56
+; RV64-NEXT: srli a2, a0, 16
+; RV64-NEXT: srli a3, a0, 8
+; RV64-NEXT: slli a4, a0, 48
+; RV64-NEXT: slli a5, a0, 56
+; RV64-NEXT: slli a6, a0, 40
; RV64-NEXT: srai a4, a4, 56
; RV64-NEXT: srai a5, a5, 56
+; RV64-NEXT: srai a6, a6, 56
+; RV64-NEXT: sgtz a6, a6
; RV64-NEXT: sgtz a5, a5
; RV64-NEXT: sgtz a4, a4
-; RV64-NEXT: sgtz a3, a3
-; RV64-NEXT: neg a3, a3
; RV64-NEXT: neg a4, a4
; RV64-NEXT: neg a5, a5
-; RV64-NEXT: and a3, a3, a6
-; RV64-NEXT: and a0, a4, a0
-; RV64-NEXT: and a2, a5, a2
+; RV64-NEXT: neg a6, a6
+; RV64-NEXT: and a3, a4, a3
+; RV64-NEXT: and a0, a5, a0
+; RV64-NEXT: and a2, a6, a2
; RV64-NEXT: slli a3, a3, 8
; RV64-NEXT: zext.b a0, a0
; RV64-NEXT: or a0, a0, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
index 5c1e41fb5e628..b83ddce61f44d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
@@ -470,61 +470,61 @@ define <vscale x 16 x i64> @test_vp_splice_nxv16i64(<vscale x 16 x i64> %va, <vs
; CHECK-LABEL: test_vp_splice_nxv16i64:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a4, vlenb
-; CHECK-NEXT: slli a5, a4, 1
-; CHECK-NEXT: addi a5, a5, -1
; CHECK-NEXT: slli a1, a4, 3
-; CHECK-NEXT: mv a7, a2
-; CHECK-NEXT: bltu a2, a5, .LBB22_2
+; CHECK-NEXT: slli a7, a4, 1
+; CHECK-NEXT: addi a7, a7, -1
+; CHECK-NEXT: add a5, a0, a1
+; CHECK-NEXT: mv a6, a2
+; CHECK-NEXT: bltu a2, a7, .LBB22_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a7, a5
+; CHECK-NEXT: mv a6, a7
; CHECK-NEXT: .LBB22_2:
; CHECK-NEXT: addi sp, sp, -80
; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; CHECK-NEXT: addi s0, sp, 80
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a5, a5, 5
-; CHECK-NEXT: sub sp, sp, a5
+; CHECK-NEXT: csrr a7, vlenb
+; CHECK-NEXT: slli a7, a7, 5
+; CHECK-NEXT: sub sp, sp, a7
; CHECK-NEXT: andi sp, sp, -64
-; CHECK-NEXT: add a5, a0, a1
-; CHECK-NEXT: slli a7, a7, 3
+; CHECK-NEXT: vl8re64.v v24, (a5)
+; CHECK-NEXT: slli a5, a6, 3
; CHECK-NEXT: addi a6, sp, 64
-; CHECK-NEXT: mv t0, a2
+; CHECK-NEXT: add a5, a6, a5
+; CHECK-NEXT: mv a7, a2
; CHECK-NEXT: bltu a2, a4, .LBB22_4
; CHECK-NEXT: # %bb.3:
-; CHECK-NEXT: mv t0, a4
+; CHECK-NEXT: mv a7, a4
; CHECK-NEXT: .LBB22_4:
-; CHECK-NEXT: vl8re64.v v24, (a5)
-; CHECK-NEXT: add a5, a6, a7
; CHECK-NEXT: vl8re64.v v0, (a0)
-; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a7, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a6)
; CHECK-NEXT: sub a0, a2, a4
+; CHECK-NEXT: add a6, a6, a1
+; CHECK-NEXT: sub a7, a3, a4
; CHECK-NEXT: sltu a2, a2, a0
; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a0, a2, a0
-; CHECK-NEXT: add a6, a6, a1
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: and a2, a2, a0
+; CHECK-NEXT: sltu a0, a3, a7
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a7
+; CHECK-NEXT: add a7, a5, a1
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v16, (a6)
-; CHECK-NEXT: mv a0, a3
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vse64.v v24, (a7)
; CHECK-NEXT: bltu a3, a4, .LBB22_6
; CHECK-NEXT: # %bb.5:
-; CHECK-NEXT: mv a0, a4
+; CHECK-NEXT: mv a3, a4
; CHECK-NEXT: .LBB22_6:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v0, (a5)
-; CHECK-NEXT: sub a2, a3, a4
-; CHECK-NEXT: add a5, a5, a1
-; CHECK-NEXT: sltu a3, a3, a2
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a2, a3, a2
-; CHECK-NEXT: addi a3, sp, 104
-; CHECK-NEXT: add a1, a3, a1
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v24, (a5)
-; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: addi a2, sp, 104
+; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a3)
+; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v8, (a2)
; CHECK-NEXT: addi sp, s0, -80
; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
@@ -537,66 +537,66 @@ define <vscale x 16 x i64> @test_vp_splice_nxv16i64(<vscale x 16 x i64> %va, <vs
define <vscale x 16 x i64> @test_vp_splice_nxv16i64_negative_offset(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) #0 {
; CHECK-LABEL: test_vp_splice_nxv16i64_negative_offset:
; CHECK: # %bb.0:
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a6, a5, 1
-; CHECK-NEXT: addi a6, a6, -1
-; CHECK-NEXT: slli a1, a5, 3
-; CHECK-NEXT: mv a4, a2
-; CHECK-NEXT: bltu a2, a6, .LBB23_2
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a1, a4, 3
+; CHECK-NEXT: slli a7, a4, 1
+; CHECK-NEXT: addi a7, a7, -1
+; CHECK-NEXT: add a5, a0, a1
+; CHECK-NEXT: mv a6, a2
+; CHECK-NEXT: bltu a2, a7, .LBB23_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a4, a6
+; CHECK-NEXT: mv a6, a7
; CHECK-NEXT: .LBB23_2:
; CHECK-NEXT: addi sp, sp, -80
; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; CHECK-NEXT: addi s0, sp, 80
-; CHECK-NEXT: csrr a6, vlenb
-; CHECK-NEXT: slli a6, a6, 5
-; CHECK-NEXT: sub sp, sp, a6
+; CHECK-NEXT: csrr a7, vlenb
+; CHECK-NEXT: slli a7, a7, 5
+; CHECK-NEXT: sub sp, sp, a7
; CHECK-NEXT: andi sp, sp, -64
-; CHECK-NEXT: add a6, a0, a1
-; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: vl8re64.v v24, (a5)
+; CHECK-NEXT: slli a5, a6, 3
; CHECK-NEXT: addi a7, sp, 64
+; CHECK-NEXT: add a6, a7, a5
; CHECK-NEXT: mv t0, a2
-; CHECK-NEXT: bltu a2, a5, .LBB23_4
+; CHECK-NEXT: bltu a2, a4, .LBB23_4
; CHECK-NEXT: # %bb.3:
-; CHECK-NEXT: mv t0, a5
+; CHECK-NEXT: mv t0, a4
; CHECK-NEXT: .LBB23_4:
-; CHECK-NEXT: vl8re64.v v24, (a6)
-; CHECK-NEXT: add a6, a7, a4
; CHECK-NEXT: vl8re64.v v0, (a0)
; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a7)
-; CHECK-NEXT: sub a0, a2, a5
+; CHECK-NEXT: sub a0, a2, a4
+; CHECK-NEXT: add a7, a7, a1
+; CHECK-NEXT: sub t0, a3, a4
; CHECK-NEXT: sltu a2, a2, a0
; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a0, a2, a0
-; CHECK-NEXT: add a7, a7, a1
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: and a2, a2, a0
+; CHECK-NEXT: sltu a0, a3, t0
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, t0
+; CHECK-NEXT: add t0, a6, a1
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v16, (a7)
-; CHECK-NEXT: mv a0, a3
-; CHECK-NEXT: bltu a3, a5, .LBB23_6
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vse64.v v24, (t0)
+; CHECK-NEXT: bltu a3, a4, .LBB23_6
; CHECK-NEXT: # %bb.5:
-; CHECK-NEXT: mv a0, a5
+; CHECK-NEXT: mv a3, a4
; CHECK-NEXT: .LBB23_6:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: li a2, 8
+; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v0, (a6)
-; CHECK-NEXT: sub a2, a3, a5
-; CHECK-NEXT: add a5, a6, a1
-; CHECK-NEXT: sltu a3, a3, a2
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a2, a3, a2
-; CHECK-NEXT: li a3, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v24, (a5)
-; CHECK-NEXT: bltu a4, a3, .LBB23_8
+; CHECK-NEXT: bltu a5, a2, .LBB23_8
; CHECK-NEXT: # %bb.7:
-; CHECK-NEXT: li a4, 8
+; CHECK-NEXT: li a5, 8
; CHECK-NEXT: .LBB23_8:
-; CHECK-NEXT: sub a2, a6, a4
+; CHECK-NEXT: sub a2, a6, a5
; CHECK-NEXT: add a1, a2, a1
-; CHECK-NEXT: vle64.v v16, (a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a2)
; CHECK-NEXT: addi sp, s0, -80
; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/VE/Scalar/min.ll b/llvm/test/CodeGen/VE/Scalar/min.ll
index 69d5ce48601f8..e8f4939f9149e 100644
--- a/llvm/test/CodeGen/VE/Scalar/min.ll
+++ b/llvm/test/CodeGen/VE/Scalar/min.ll
@@ -278,18 +278,18 @@ define i32 @min2u32(i32, i32) {
define zeroext i1 @mini1(i1 zeroext, i1 zeroext) {
; CHECK-LABEL: mini1:
; CHECK: # %bb.0:
-; CHECK-NEXT: and %s0, %s0, (32)0
-; CHECK-NEXT: and %s2, %s1, %s0
-; CHECK-NEXT: cmov.w.ne %s2, %s1, %s0
-; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1
+; CHECK-NEXT: and %s2, 1, %s0
+; CHECK-NEXT: and %s0, %s1, %s0
+; CHECK-NEXT: cmov.w.ne %s0, %s1, %s2
+; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1
; CHECK-NEXT: b.l.t (, %s10)
;
; OPT-LABEL: mini1:
; OPT: # %bb.0:
-; OPT-NEXT: and %s0, %s0, (32)0
-; OPT-NEXT: and %s2, %s1, %s0
-; OPT-NEXT: cmov.w.ne %s2, %s1, %s0
-; OPT-NEXT: adds.w.zx %s0, %s2, (0)1
+; OPT-NEXT: and %s2, 1, %s0
+; OPT-NEXT: and %s0, %s1, %s0
+; OPT-NEXT: cmov.w.ne %s0, %s1, %s2
+; OPT-NEXT: adds.w.zx %s0, %s0, (0)1
; OPT-NEXT: b.l.t (, %s10)
%3 = xor i1 %0, true
%4 = and i1 %3, %1
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 1ae1d61091362..98187d61c1f84 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2201,9 +2201,9 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE41-NEXT: psraw $8, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllw $7, %xmm3
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7]
+; SSE41-NEXT: paddw %xmm0, %xmm3
+; SSE41-NEXT: psllw $7, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: psraw $8, %xmm2
@@ -2234,9 +2234,9 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
+; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index 189de051011d2..962ffe47d0d51 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -490,18 +490,19 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind {
define i32 @freeze_ashr_exact_extra_use(i32 %a0, ptr %escape) nounwind {
; X86-LABEL: freeze_ashr_exact_extra_use:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sarl $3, %eax
-; X86-NEXT: movl %eax, (%ecx)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sarl $3, %ecx
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarl $6, %eax
; X86-NEXT: retl
;
; X64-LABEL: freeze_ashr_exact_extra_use:
; X64: # %bb.0:
+; X64-NEXT: sarl $3, %edi
+; X64-NEXT: movl %edi, (%rsi)
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: sarl $3, %eax
-; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: sarl $6, %eax
; X64-NEXT: retq
%x = ashr exact i32 %a0, 3
@@ -603,18 +604,19 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind {
define i32 @freeze_lshr_exact_extra_use(i32 %a0, ptr %escape) nounwind {
; X86-LABEL: freeze_lshr_exact_extra_use:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl %eax, (%ecx)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrl $5, %eax
; X86-NEXT: retl
;
; X64-LABEL: freeze_lshr_exact_extra_use:
; X64: # %bb.0:
+; X64-NEXT: shrl $3, %edi
+; X64-NEXT: movl %edi, (%rsi)
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $3, %eax
-; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: shrl $5, %eax
; X64-NEXT: retq
%x = lshr exact i32 %a0, 3
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index 953a5e7285fe4..15b43c41b9945 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -600,8 +600,8 @@ define void @freeze_buildvector_extrause(ptr %origin0, ptr %origin1, ptr %origin
; X86-NEXT: vpinsrd $1, (%edi), %xmm0, %xmm0
; X86-NEXT: vpinsrd $2, (%esi), %xmm0, %xmm0
; X86-NEXT: vpinsrd $3, (%edx), %xmm0, %xmm0
-; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-NEXT: vmovdqa %xmm0, (%ecx)
+; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; X86-NEXT: vmovdqa %xmm1, (%ecx)
; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%eax)
; X86-NEXT: popl %esi
@@ -616,8 +616,8 @@ define void @freeze_buildvector_extrause(ptr %origin0, ptr %origin1, ptr %origin
; X64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0
; X64-NEXT: vpinsrd $3, (%rcx), %xmm0, %xmm0
; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
-; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X64-NEXT: vmovdqa %xmm0, (%r9)
+; X64-NEXT: vpand %xmm1, %xmm0, %xmm1
+; X64-NEXT: vmovdqa %xmm1, (%r9)
; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%r8)
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index 3a4a638c7330a..fb2433dbbb1e1 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -730,36 +730,36 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa (%rdi), %xmm2
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4
; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9
+; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm9
; AVX1-NEXT: vpmuludq %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm5
+; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm4
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm7
+; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm7
; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4
-; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm4
-; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm5
+; AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -767,20 +767,20 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; AVX2-LABEL: vec256_i64_signed_mem_reg:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm3
+; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm4
; AVX2-NEXT: vpsrlq $33, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2
+; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@@ -790,36 +790,36 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT: vmovdqa (%rdi), %xmm2
; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
-; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4
-; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5
+; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm4
; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0
-; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0
-; XOP-NEXT: vpsubq %xmm0, %xmm5, %xmm0
+; XOP-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; XOP-NEXT: vpsubq %xmm0, %xmm4, %xmm0
+; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm5
; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1
-; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1
+; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0
; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9
+; XOP-NEXT: vpor %xmm4, %xmm8, %xmm9
; XOP-NEXT: vpmuludq %xmm0, %xmm9, %xmm0
-; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5
-; XOP-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0
+; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4
+; XOP-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
+; XOP-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; XOP-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm5
+; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm4
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
-; XOP-NEXT: vpor %xmm4, %xmm8, %xmm7
+; XOP-NEXT: vpor %xmm5, %xmm8, %xmm7
; XOP-NEXT: vpmuludq %xmm7, %xmm1, %xmm1
-; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4
-; XOP-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
-; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1
+; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5
+; XOP-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
+; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1
; XOP-NEXT: vpsllq $32, %xmm1, %xmm1
-; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm4
-; XOP-NEXT: vpaddq %xmm3, %xmm4, %xmm3
+; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm5
+; XOP-NEXT: vpaddq %xmm3, %xmm5, %xmm3
; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; XOP-NEXT: vpaddq %xmm2, %xmm5, %xmm2
+; XOP-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
@@ -897,101 +897,101 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwind {
; AVX1-LABEL: vec256_i64_signed_reg_mem:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa (%rdi), %xmm2
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6
-; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7
-; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
-; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9
-; AVX1-NEXT: vpmuludq %xmm1, %xmm9, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5
+; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsubq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm6
+; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm2, %xmm2
-; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm7
-; AVX1-NEXT: vpmuludq %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
+; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm9
+; AVX1-NEXT: vpmuludq %xmm2, %xmm9, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4
-; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm4
-; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm4
+; AVX1-NEXT: vpsrlq $33, %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm7
+; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm5
+; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i64_signed_reg_mem:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm3
+; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm4
; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2
+; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOP-LABEL: vec256_i64_signed_reg_mem:
; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa (%rdi), %xmm1
-; XOP-NEXT: vmovdqa 16(%rdi), %xmm2
-; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOP-NEXT: vpcomgtq %xmm2, %xmm3, %xmm4
-; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm5
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1
-; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1
-; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOP-NEXT: vmovdqa (%rdi), %xmm2
+; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
+; XOP-NEXT: vpcomgtq %xmm2, %xmm0, %xmm4
+; XOP-NEXT: vpsubq %xmm2, %xmm0, %xmm2
; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2
; XOP-NEXT: vpsubq %xmm2, %xmm4, %xmm2
-; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6
-; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7
-; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
-; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9
-; XOP-NEXT: vpmuludq %xmm1, %xmm9, %xmm1
-; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5
-; XOP-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1
-; XOP-NEXT: vpsllq $32, %xmm1, %xmm1
-; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm5
+; XOP-NEXT: vpcomgtq %xmm3, %xmm1, %xmm5
+; XOP-NEXT: vpsubq %xmm3, %xmm1, %xmm3
+; XOP-NEXT: vpxor %xmm5, %xmm3, %xmm3
+; XOP-NEXT: vpsubq %xmm3, %xmm5, %xmm3
+; XOP-NEXT: vpsrlq $1, %xmm3, %xmm6
+; XOP-NEXT: vpsrlq $1, %xmm2, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm2, %xmm2
-; XOP-NEXT: vpor %xmm4, %xmm8, %xmm7
-; XOP-NEXT: vpmuludq %xmm7, %xmm2, %xmm2
+; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
+; XOP-NEXT: vpor %xmm4, %xmm8, %xmm9
+; XOP-NEXT: vpmuludq %xmm2, %xmm9, %xmm2
; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4
-; XOP-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
+; XOP-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
; XOP-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; XOP-NEXT: vpsllq $32, %xmm2, %xmm2
-; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm4
-; XOP-NEXT: vpaddq %xmm3, %xmm4, %xmm3
-; XOP-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0
-; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm4
+; XOP-NEXT: vpsrlq $33, %xmm3, %xmm3
+; XOP-NEXT: vpor %xmm5, %xmm8, %xmm7
+; XOP-NEXT: vpmuludq %xmm7, %xmm3, %xmm3
+; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5
+; XOP-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
+; XOP-NEXT: vpaddq %xmm3, %xmm5, %xmm3
+; XOP-NEXT: vpsllq $32, %xmm3, %xmm3
+; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm5
+; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; XOP-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; XOP-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec256_i64_signed_reg_mem:
@@ -1071,36 +1071,36 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX1-NEXT: vmovdqa (%rdi), %xmm2
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4
; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9
+; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm9
; AVX1-NEXT: vpmuludq %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm5
+; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm4
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm7
+; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm7
; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4
-; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm4
-; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm5
+; AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -1109,20 +1109,20 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rsi), %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm3
+; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm4
; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2
+; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -1133,36 +1133,36 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-NEXT: vmovdqa 16(%rsi), %xmm1
; XOP-NEXT: vmovdqa (%rdi), %xmm2
; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
-; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4
-; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5
+; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm4
; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0
-; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0
-; XOP-NEXT: vpsubq %xmm0, %xmm5, %xmm0
+; XOP-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; XOP-NEXT: vpsubq %xmm0, %xmm4, %xmm0
+; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm5
; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1
-; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1
+; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0
; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9
+; XOP-NEXT: vpor %xmm4, %xmm8, %xmm9
; XOP-NEXT: vpmuludq %xmm0, %xmm9, %xmm0
-; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5
-; XOP-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
-; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0
+; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4
+; XOP-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
+; XOP-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; XOP-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm5
+; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm4
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
-; XOP-NEXT: vpor %xmm4, %xmm8, %xmm7
+; XOP-NEXT: vpor %xmm5, %xmm8, %xmm7
; XOP-NEXT: vpmuludq %xmm7, %xmm1, %xmm1
-; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4
-; XOP-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
-; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1
+; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5
+; XOP-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
+; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1
; XOP-NEXT: vpsllq $32, %xmm1, %xmm1
-; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm4
-; XOP-NEXT: vpaddq %xmm3, %xmm4, %xmm3
+; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm5
+; XOP-NEXT: vpaddq %xmm3, %xmm5, %xmm3
; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; XOP-NEXT: vpaddq %xmm2, %xmm5, %xmm2
+; XOP-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
@@ -1627,27 +1627,27 @@ define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounw
define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounwind {
; AVX1-LABEL: vec256_i16_signed_reg_mem:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm6
-; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpsubw %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm6
-; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa (%rdi), %xmm2
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm6
+; AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsubw %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpminsw %xmm3, %xmm1, %xmm6
+; AVX1-NEXT: vpmaxsw %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpsubw %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i16_signed_reg_mem:
@@ -1665,25 +1665,25 @@ define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounw
;
; XOP-LABEL: vec256_i16_signed_reg_mem:
; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa (%rdi), %xmm1
-; XOP-NEXT: vmovdqa 16(%rdi), %xmm2
-; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOP-NEXT: vpcomgtw %xmm2, %xmm3, %xmm4
-; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm5
-; XOP-NEXT: vpminsw %xmm2, %xmm3, %xmm6
-; XOP-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
+; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOP-NEXT: vmovdqa (%rdi), %xmm2
+; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
+; XOP-NEXT: vpcomgtw %xmm3, %xmm1, %xmm4
+; XOP-NEXT: vpcomgtw %xmm2, %xmm0, %xmm5
+; XOP-NEXT: vpminsw %xmm3, %xmm1, %xmm6
+; XOP-NEXT: vpmaxsw %xmm3, %xmm1, %xmm3
+; XOP-NEXT: vpsubw %xmm6, %xmm3, %xmm3
+; XOP-NEXT: vpminsw %xmm2, %xmm0, %xmm6
+; XOP-NEXT: vpmaxsw %xmm2, %xmm0, %xmm2
; XOP-NEXT: vpsubw %xmm6, %xmm2, %xmm2
-; XOP-NEXT: vpminsw %xmm1, %xmm0, %xmm6
-; XOP-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
-; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1
-; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT: vpsrlw $1, %xmm2, %xmm2
+; XOP-NEXT: vpsrlw $1, %xmm3, %xmm3
; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
-; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm2, %xmm2
-; XOP-NEXT: vpmacsww %xmm0, %xmm5, %xmm1, %xmm0
-; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOP-NEXT: vpmacsww %xmm1, %xmm4, %xmm3, %xmm1
+; XOP-NEXT: vpmacsww %xmm0, %xmm5, %xmm2, %xmm0
+; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec256_i16_signed_reg_mem:
@@ -2425,9 +2425,9 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind {
; AVX1-LABEL: vec256_i8_signed_reg_mem:
; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa (%rdi), %xmm2
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm5
; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm6
@@ -2487,38 +2487,38 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
;
; XOP-LABEL: vec256_i8_signed_reg_mem:
; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa (%rdi), %xmm1
-; XOP-NEXT: vmovdqa 16(%rdi), %xmm2
-; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOP-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4
-; XOP-NEXT: vpcomgtb %xmm1, %xmm0, %xmm5
-; XOP-NEXT: vpminsb %xmm1, %xmm0, %xmm6
-; XOP-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
-; XOP-NEXT: vpsubb %xmm6, %xmm1, %xmm1
-; XOP-NEXT: vpminsb %xmm2, %xmm3, %xmm6
-; XOP-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
+; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOP-NEXT: vmovdqa (%rdi), %xmm2
+; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
+; XOP-NEXT: vpcomgtb %xmm3, %xmm1, %xmm4
+; XOP-NEXT: vpcomgtb %xmm2, %xmm0, %xmm5
+; XOP-NEXT: vpminsb %xmm2, %xmm0, %xmm6
+; XOP-NEXT: vpmaxsb %xmm2, %xmm0, %xmm2
; XOP-NEXT: vpsubb %xmm6, %xmm2, %xmm2
+; XOP-NEXT: vpminsb %xmm3, %xmm1, %xmm6
+; XOP-NEXT: vpmaxsb %xmm3, %xmm1, %xmm3
+; XOP-NEXT: vpsubb %xmm6, %xmm3, %xmm3
; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
+; XOP-NEXT: vpshlb %xmm6, %xmm3, %xmm3
; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2
-; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1
; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
-; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
+; XOP-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8
; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
-; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
+; XOP-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm2
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
-; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1
+; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm2, %xmm2
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
-; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6
+; XOP-NEXT: vpmaddubsw %xmm6, %xmm3, %xmm6
; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
-; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
-; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
-; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOP-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3
+; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm3, %xmm3
+; XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; XOP-NEXT: vpaddb %xmm0, %xmm2, %xmm0
+; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX512F-LABEL: vec256_i8_signed_reg_mem:
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
index 5f6337e29d685..a4750b4cd4ad0 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -507,58 +507,58 @@ define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounw
define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i16_signed_reg_mem:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
-; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5
-; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm5
+; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3
+; AVX512F-NEXT: vpsubw %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm5
+; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5
-; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3
; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
-; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5))
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm3, %ymm6, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5))
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i16_signed_reg_mem:
@@ -939,66 +939,66 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind
define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i8_signed_reg_mem:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
-; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
-; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm5
+; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
+; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm5
+; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5
-; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
+; AVX512F-NEXT: vpsubb %ymm3, %ymm7, %ymm3
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5))
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm7, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5))
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i8_signed_reg_mem:
diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll
index 1921cf383b2f2..a75d42ed0c50f 100644
--- a/llvm/test/CodeGen/X86/midpoint-int.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int.ll
@@ -28,24 +28,27 @@ define i32 @scalar_i32_signed_reg_reg(i32 %a1, i32 %a2) nounwind {
;
; X86-LABEL: scalar_i32_signed_reg_reg:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpl %esi, %ecx
-; X86-NEXT: setle %al
-; X86-NEXT: leal -1(%eax,%eax), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: subl %esi, %edx
+; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: subl %esi, %eax
+; X86-NEXT: setle %bl
+; X86-NEXT: leal -1(%ebx,%ebx), %esi
; X86-NEXT: jg .LBB0_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: negl %edx
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: .LBB0_2:
; X86-NEXT: shrl %eax
-; X86-NEXT: imull %edx, %eax
+; X86-NEXT: imull %esi, %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
%t3 = icmp sgt i32 %a1, %a2 ; signed
%t4 = select i1 %t3, i32 -1, i32 1
@@ -76,26 +79,27 @@ define i32 @scalar_i32_unsigned_reg_reg(i32 %a1, i32 %a2) nounwind {
;
; X86-LABEL: scalar_i32_unsigned_reg_reg:
; X86: # %bb.0:
-; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: setae %al
-; X86-NEXT: leal -1(%eax,%eax), %edx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: subl %esi, %edx
+; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: subl %edi, %eax
+; X86-NEXT: subl %esi, %eax
+; X86-NEXT: setbe %bl
+; X86-NEXT: leal -1(%ebx,%ebx), %esi
; X86-NEXT: ja .LBB1_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: negl %edx
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: .LBB1_2:
; X86-NEXT: shrl %eax
-; X86-NEXT: imull %edx, %eax
+; X86-NEXT: imull %esi, %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
%t3 = icmp ugt i32 %a1, %a2
%t4 = select i1 %t3, i32 -1, i32 1
@@ -128,25 +132,28 @@ define i32 @scalar_i32_signed_mem_reg(ptr %a1_addr, i32 %a2) nounwind {
;
; X86-LABEL: scalar_i32_signed_mem_reg:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl (%eax), %ecx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: setle %al
-; X86-NEXT: leal -1(%eax,%eax), %esi
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: subl %esi, %edx
+; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: subl %edx, %eax
+; X86-NEXT: subl %esi, %eax
+; X86-NEXT: setle %bl
+; X86-NEXT: leal -1(%ebx,%ebx), %esi
; X86-NEXT: jg .LBB2_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: negl %edx
; X86-NEXT: movl %edx, %eax
; X86-NEXT: .LBB2_2:
; X86-NEXT: shrl %eax
; X86-NEXT: imull %esi, %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
%a1 = load i32, ptr %a1_addr
%t3 = icmp sgt i32 %a1, %a2 ; signed
@@ -178,25 +185,28 @@ define i32 @scalar_i32_signed_reg_mem(i32 %a1, ptr %a2_addr) nounwind {
;
; X86-LABEL: scalar_i32_signed_reg_mem:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl (%eax), %esi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpl %esi, %ecx
-; X86-NEXT: setle %al
-; X86-NEXT: leal -1(%eax,%eax), %edx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: subl %esi, %edx
+; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: subl %esi, %eax
+; X86-NEXT: setle %bl
+; X86-NEXT: leal -1(%ebx,%ebx), %esi
; X86-NEXT: jg .LBB3_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: negl %edx
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: .LBB3_2:
; X86-NEXT: shrl %eax
-; X86-NEXT: imull %edx, %eax
+; X86-NEXT: imull %esi, %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
%a2 = load i32, ptr %a2_addr
%t3 = icmp sgt i32 %a1, %a2 ; signed
@@ -229,26 +239,29 @@ define i32 @scalar_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
;
; X86-LABEL: scalar_i32_signed_mem_mem:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %ecx
; X86-NEXT: movl (%eax), %esi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpl %esi, %ecx
-; X86-NEXT: setle %al
-; X86-NEXT: leal -1(%eax,%eax), %edx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: subl %esi, %edx
+; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: subl %esi, %eax
+; X86-NEXT: setle %bl
+; X86-NEXT: leal -1(%ebx,%ebx), %esi
; X86-NEXT: jg .LBB4_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: negl %edx
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: .LBB4_2:
; X86-NEXT: shrl %eax
-; X86-NEXT: imull %edx, %eax
+; X86-NEXT: imull %esi, %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
%a1 = load i32, ptr %a1_addr
%a2 = load i32, ptr %a2_addr
@@ -291,36 +304,34 @@ define i64 @scalar_i64_signed_reg_reg(i64 %a1, i64 %a2) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmpl %esi, %edx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: setl %al
-; X86-NEXT: movzbl %al, %edi
-; X86-NEXT: negl %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: orl $1, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %esi, %eax
; X86-NEXT: subl %edx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: sbbl %ecx, %ebp
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: sbbl %ebp, %edi
; X86-NEXT: subl %esi, %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sbbl %ecx, %ebp
+; X86-NEXT: setl %bl
+; X86-NEXT: movzbl %bl, %ebx
; X86-NEXT: jl .LBB5_2
; X86-NEXT: # %bb.1:
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: movl %ebp, %edi
; X86-NEXT: .LBB5_2:
-; X86-NEXT: shrdl $1, %ebp, %eax
-; X86-NEXT: shrl %ebp
-; X86-NEXT: imull %eax, %edi
-; X86-NEXT: mull %ebx
-; X86-NEXT: addl %edi, %edx
+; X86-NEXT: negl %ebx
+; X86-NEXT: shrdl $1, %edi, %eax
+; X86-NEXT: shrl %edi
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: imull %ebx, %ebp
+; X86-NEXT: orl $1, %ebx
+; X86-NEXT: mull %ebx
; X86-NEXT: addl %ebp, %edx
-; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: imull %edi, %ebx
+; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -359,10 +370,10 @@ define i64 @scalar_i64_unsigned_reg_reg(i64 %a1, i64 %a2) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: cmpl %eax, %ebp
; X86-NEXT: sbbl %ecx, %esi
@@ -429,45 +440,36 @@ define i64 @scalar_i64_signed_mem_reg(ptr %a1_addr, i64 %a2) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $12, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %ebx
-; X86-NEXT: movl 4(%eax), %esi
-; X86-NEXT: cmpl %ebx, %edx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: sbbl %esi, %eax
-; X86-NEXT: setl %al
-; X86-NEXT: movzbl %al, %edi
-; X86-NEXT: negl %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: orl $1, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl (%eax), %esi
+; X86-NEXT: movl 4(%eax), %ecx
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: subl %edx, %eax
-; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: sbbl %ebp, %edi
+; X86-NEXT: subl %esi, %edx
; X86-NEXT: sbbl %ecx, %ebp
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: subl %ebx, %edx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: setl %bl
+; X86-NEXT: movzbl %bl, %ebx
; X86-NEXT: jl .LBB7_2
; X86-NEXT: # %bb.1:
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: movl %ebp, %edi
; X86-NEXT: .LBB7_2:
-; X86-NEXT: shrdl $1, %ebp, %eax
-; X86-NEXT: shrl %ebp
-; X86-NEXT: imull %eax, %edi
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT: mull %ecx
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: imull %ecx, %ebp
+; X86-NEXT: negl %ebx
+; X86-NEXT: shrdl $1, %edi, %eax
+; X86-NEXT: shrl %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: imull %ebx, %ebp
+; X86-NEXT: orl $1, %ebx
+; X86-NEXT: mull %ebx
; X86-NEXT: addl %ebp, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: addl $12, %esp
+; X86-NEXT: imull %edi, %ebx
+; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -508,37 +510,35 @@ define i64 @scalar_i64_signed_reg_mem(i64 %a1, ptr %a2_addr) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl 4(%eax), %ecx
-; X86-NEXT: cmpl %esi, %edx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: setl %al
-; X86-NEXT: movzbl %al, %edi
-; X86-NEXT: negl %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: orl $1, %ebx
+; X86-NEXT: movl 4(%eax), %ebp
; X86-NEXT: movl %esi, %eax
; X86-NEXT: subl %edx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: sbbl %ecx, %ebp
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: sbbl %ebp, %edi
; X86-NEXT: subl %esi, %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sbbl %ecx, %ebp
+; X86-NEXT: setl %bl
+; X86-NEXT: movzbl %bl, %ebx
; X86-NEXT: jl .LBB8_2
; X86-NEXT: # %bb.1:
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: movl %ebp, %edi
; X86-NEXT: .LBB8_2:
-; X86-NEXT: shrdl $1, %ebp, %eax
-; X86-NEXT: shrl %ebp
-; X86-NEXT: imull %eax, %edi
-; X86-NEXT: mull %ebx
-; X86-NEXT: addl %edi, %edx
+; X86-NEXT: negl %ebx
+; X86-NEXT: shrdl $1, %edi, %eax
+; X86-NEXT: shrl %edi
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: imull %ebx, %ebp
+; X86-NEXT: orl $1, %ebx
+; X86-NEXT: mull %ebx
; X86-NEXT: addl %ebp, %edx
-; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: imull %edi, %ebx
+; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -579,46 +579,37 @@ define i64 @scalar_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $12, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl (%ecx), %ebx
-; X86-NEXT: movl 4(%ecx), %esi
+; X86-NEXT: movl (%ecx), %esi
+; X86-NEXT: movl 4(%ecx), %ecx
; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl 4(%eax), %ecx
-; X86-NEXT: cmpl %ebx, %edx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: sbbl %esi, %eax
-; X86-NEXT: setl %al
-; X86-NEXT: movzbl %al, %edi
-; X86-NEXT: negl %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: orl $1, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl 4(%eax), %ebp
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: subl %edx, %eax
-; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: sbbl %ebp, %edi
+; X86-NEXT: subl %esi, %edx
; X86-NEXT: sbbl %ecx, %ebp
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: subl %ebx, %edx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: setl %bl
+; X86-NEXT: movzbl %bl, %ebx
; X86-NEXT: jl .LBB9_2
; X86-NEXT: # %bb.1:
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: movl %ebp, %edi
; X86-NEXT: .LBB9_2:
-; X86-NEXT: shrdl $1, %ebp, %eax
-; X86-NEXT: shrl %ebp
-; X86-NEXT: imull %eax, %edi
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT: mull %ecx
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: imull %ecx, %ebp
+; X86-NEXT: negl %ebx
+; X86-NEXT: shrdl $1, %edi, %eax
+; X86-NEXT: shrl %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: imull %ebx, %ebp
+; X86-NEXT: orl $1, %ebx
+; X86-NEXT: mull %ebx
; X86-NEXT: addl %ebp, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: addl $12, %esp
+; X86-NEXT: imull %edi, %ebx
+; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -667,17 +658,16 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind {
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: subw %dx, %ax
+; X86-NEXT: setle %bl
+; X86-NEXT: leal -1(%ebx,%ebx), %edx
; X86-NEXT: jg .LBB10_2
; X86-NEXT: # %bb.1:
; X86-NEXT: negl %eax
; X86-NEXT: .LBB10_2:
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: cmpw %dx, %cx
-; X86-NEXT: setle %bl
-; X86-NEXT: leal -1(%ebx,%ebx), %edx
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: shrl %eax
; X86-NEXT: imull %edx, %eax
@@ -720,17 +710,16 @@ define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind {
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: subw %dx, %ax
+; X86-NEXT: setbe %bl
+; X86-NEXT: leal -1(%ebx,%ebx), %edx
; X86-NEXT: ja .LBB11_2
; X86-NEXT: # %bb.1:
; X86-NEXT: negl %eax
; X86-NEXT: .LBB11_2:
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: cmpw %cx, %dx
-; X86-NEXT: setae %bl
-; X86-NEXT: leal -1(%ebx,%ebx), %edx
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: shrl %eax
; X86-NEXT: imull %edx, %eax
@@ -777,16 +766,15 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind {
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl (%eax), %ecx
+; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: subw %dx, %ax
+; X86-NEXT: setle %bl
+; X86-NEXT: leal -1(%ebx,%ebx), %edx
; X86-NEXT: jg .LBB12_2
; X86-NEXT: # %bb.1:
; X86-NEXT: negl %eax
; X86-NEXT: .LBB12_2:
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: cmpw %dx, %cx
-; X86-NEXT: setle %bl
-; X86-NEXT: leal -1(%ebx,%ebx), %edx
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: shrl %eax
; X86-NEXT: imull %edx, %eax
@@ -829,19 +817,18 @@ define i16 @scalar_i16_signed_reg_mem(i16 %a1, ptr %a2_addr) nounwind {
; X86-LABEL: scalar_i16_signed_reg_mem:
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl (%eax), %edx
+; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: subw %dx, %ax
+; X86-NEXT: setle %bl
+; X86-NEXT: leal -1(%ebx,%ebx), %edx
; X86-NEXT: jg .LBB13_2
; X86-NEXT: # %bb.1:
; X86-NEXT: negl %eax
; X86-NEXT: .LBB13_2:
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: cmpw %dx, %cx
-; X86-NEXT: setle %bl
-; X86-NEXT: leal -1(%ebx,%ebx), %edx
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: shrl %eax
; X86-NEXT: imull %edx, %eax
@@ -888,16 +875,15 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
; X86-NEXT: movzwl (%eax), %edx
+; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: subw %dx, %ax
+; X86-NEXT: setle %bl
+; X86-NEXT: leal -1(%ebx,%ebx), %edx
; X86-NEXT: jg .LBB14_2
; X86-NEXT: # %bb.1:
; X86-NEXT: negl %eax
; X86-NEXT: .LBB14_2:
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: cmpw %dx, %cx
-; X86-NEXT: setle %bl
-; X86-NEXT: leal -1(%ebx,%ebx), %edx
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: shrl %eax
; X86-NEXT: imull %edx, %eax
@@ -946,17 +932,16 @@ define i8 @scalar_i8_signed_reg_reg(i8 %a1, i8 %a2) nounwind {
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT: cmpb %ah, %cl
-; X86-NEXT: setg %dl
-; X86-NEXT: negb %dl
-; X86-NEXT: orb $1, %dl
; X86-NEXT: movb %cl, %al
; X86-NEXT: subb %ah, %al
+; X86-NEXT: setg %dl
; X86-NEXT: jg .LBB15_2
; X86-NEXT: # %bb.1:
; X86-NEXT: subb %cl, %ah
; X86-NEXT: movb %ah, %al
; X86-NEXT: .LBB15_2:
+; X86-NEXT: negb %dl
+; X86-NEXT: orb $1, %dl
; X86-NEXT: shrb %al
; X86-NEXT: mulb %dl
; X86-NEXT: addb %cl, %al
@@ -993,18 +978,17 @@ define i8 @scalar_i8_unsigned_reg_reg(i8 %a1, i8 %a2) nounwind {
; X86-LABEL: scalar_i8_unsigned_reg_reg:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: movb %ch, %ah
-; X86-NEXT: subb %cl, %ah
-; X86-NEXT: sbbl %edx, %edx
-; X86-NEXT: orb $1, %dl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
; X86-NEXT: movb %cl, %al
-; X86-NEXT: subb %ch, %al
+; X86-NEXT: subb %ah, %al
+; X86-NEXT: seta %dl
; X86-NEXT: ja .LBB16_2
; X86-NEXT: # %bb.1:
+; X86-NEXT: subb %cl, %ah
; X86-NEXT: movb %ah, %al
; X86-NEXT: .LBB16_2:
+; X86-NEXT: negb %dl
+; X86-NEXT: orb $1, %dl
; X86-NEXT: shrb %al
; X86-NEXT: mulb %dl
; X86-NEXT: addb %cl, %al
@@ -1046,17 +1030,16 @@ define i8 @scalar_i8_signed_mem_reg(ptr %a1_addr, i8 %a2) nounwind {
; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzbl (%ecx), %ecx
-; X86-NEXT: cmpb %ah, %cl
-; X86-NEXT: setg %dl
-; X86-NEXT: negb %dl
-; X86-NEXT: orb $1, %dl
; X86-NEXT: movb %cl, %al
; X86-NEXT: subb %ah, %al
+; X86-NEXT: setg %dl
; X86-NEXT: jg .LBB17_2
; X86-NEXT: # %bb.1:
; X86-NEXT: subb %cl, %ah
; X86-NEXT: movb %ah, %al
; X86-NEXT: .LBB17_2:
+; X86-NEXT: negb %dl
+; X86-NEXT: orb $1, %dl
; X86-NEXT: shrb %al
; X86-NEXT: mulb %dl
; X86-NEXT: addb %cl, %al
@@ -1096,17 +1079,16 @@ define i8 @scalar_i8_signed_reg_mem(i8 %a1, ptr %a2_addr) nounwind {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb (%eax), %ah
-; X86-NEXT: cmpb %ah, %cl
-; X86-NEXT: setg %dl
-; X86-NEXT: negb %dl
-; X86-NEXT: orb $1, %dl
; X86-NEXT: movb %cl, %al
; X86-NEXT: subb %ah, %al
+; X86-NEXT: setg %dl
; X86-NEXT: jg .LBB18_2
; X86-NEXT: # %bb.1:
; X86-NEXT: subb %cl, %ah
; X86-NEXT: movb %ah, %al
; X86-NEXT: .LBB18_2:
+; X86-NEXT: negb %dl
+; X86-NEXT: orb $1, %dl
; X86-NEXT: shrb %al
; X86-NEXT: mulb %dl
; X86-NEXT: addb %cl, %al
@@ -1148,17 +1130,16 @@ define i8 @scalar_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzbl (%ecx), %ecx
; X86-NEXT: movb (%eax), %ah
-; X86-NEXT: cmpb %ah, %cl
-; X86-NEXT: setg %dl
-; X86-NEXT: negb %dl
-; X86-NEXT: orb $1, %dl
; X86-NEXT: movb %cl, %al
; X86-NEXT: subb %ah, %al
+; X86-NEXT: setg %dl
; X86-NEXT: jg .LBB19_2
; X86-NEXT: # %bb.1:
; X86-NEXT: subb %cl, %ah
; X86-NEXT: movb %ah, %al
; X86-NEXT: .LBB19_2:
+; X86-NEXT: negb %dl
+; X86-NEXT: orb $1, %dl
; X86-NEXT: shrb %al
; X86-NEXT: mulb %dl
; X86-NEXT: addb %cl, %al
diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll
index a1da40e7e7655..f53983036a016 100644
--- a/llvm/test/CodeGen/X86/oddsubvector.ll
+++ b/llvm/test/CodeGen/X86/oddsubvector.ll
@@ -155,10 +155,10 @@ define <16 x i32> @PR42819(ptr %a0) {
define void @PR42833() {
; SSE2-LABEL: PR42833:
; SSE2: # %bb.0:
-; SSE2-NEXT: movl b(%rip), %eax
-; SSE2-NEXT: movdqa c+128(%rip), %xmm0
; SSE2-NEXT: movdqa c+144(%rip), %xmm2
-; SSE2-NEXT: addl c+128(%rip), %eax
+; SSE2-NEXT: movdqa c+128(%rip), %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: addl b(%rip), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm3
@@ -191,10 +191,10 @@ define void @PR42833() {
;
; SSE42-LABEL: PR42833:
; SSE42: # %bb.0:
-; SSE42-NEXT: movl b(%rip), %eax
-; SSE42-NEXT: movdqa c+128(%rip), %xmm0
; SSE42-NEXT: movdqa c+144(%rip), %xmm1
-; SSE42-NEXT: addl c+128(%rip), %eax
+; SSE42-NEXT: movdqa c+128(%rip), %xmm0
+; SSE42-NEXT: movd %xmm0, %eax
+; SSE42-NEXT: addl b(%rip), %eax
; SSE42-NEXT: movd %eax, %xmm2
; SSE42-NEXT: paddd %xmm0, %xmm2
; SSE42-NEXT: movdqa d+144(%rip), %xmm3
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index b633c28a214b7..412455384e937 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -23,7 +23,7 @@ define void @f() nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $160, %esp
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movzbl (%eax), %eax
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index 894186f9b343b..1ab1a1a01e168 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -1094,26 +1094,25 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $1, %xmm1, %r11d
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: movzbl %al, %edx
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: vpextrb $1, %xmm1, %r13d
+; AVX2-NEXT: vmovd %xmm1, %esi
+; AVX2-NEXT: movl %esi, %eax
; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: subb %r11b, %al
-; AVX2-NEXT: vpextrb $2, %xmm1, %esi
-; AVX2-NEXT: subb %sil, %al
-; AVX2-NEXT: vpextrb $3, %xmm1, %r13d
; AVX2-NEXT: subb %r13b, %al
+; AVX2-NEXT: vpextrb $2, %xmm1, %edx
+; AVX2-NEXT: subb %dl, %al
+; AVX2-NEXT: vpextrb $3, %xmm1, %ebp
+; AVX2-NEXT: subb %bpl, %al
; AVX2-NEXT: vpextrb $4, %xmm1, %r12d
; AVX2-NEXT: subb %r12b, %al
; AVX2-NEXT: vpextrb $5, %xmm1, %r15d
; AVX2-NEXT: subb %r15b, %al
; AVX2-NEXT: vpextrb $6, %xmm1, %r14d
; AVX2-NEXT: subb %r14b, %al
-; AVX2-NEXT: vpextrb $7, %xmm1, %ebp
-; AVX2-NEXT: subb %bpl, %al
-; AVX2-NEXT: vpextrb $8, %xmm1, %ebx
+; AVX2-NEXT: vpextrb $7, %xmm1, %ebx
; AVX2-NEXT: subb %bl, %al
+; AVX2-NEXT: vpextrb $8, %xmm1, %r11d
+; AVX2-NEXT: subb %r11b, %al
; AVX2-NEXT: vpextrb $9, %xmm1, %r10d
; AVX2-NEXT: subb %r10b, %al
; AVX2-NEXT: vpextrb $10, %xmm1, %r9d
@@ -1123,108 +1122,94 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
; AVX2-NEXT: vpextrb $12, %xmm1, %edi
; AVX2-NEXT: subb %dil, %al
; AVX2-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: subb %cl, %al
; AVX2-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: subb %cl, %al
; AVX2-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX2-NEXT: subb %cl, %al
; AVX2-NEXT: movzbl %al, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzbl -40(%rsp,%rax), %eax
; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX2-NEXT: vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: andl $1, %esi
+; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rsi)
+; AVX2-NEXT: andl $1, %r13d
+; AVX2-NEXT: addq %rsi, %r13
+; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%r13)
; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rdx)
-; AVX2-NEXT: movzbl %r11b, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT: movzbl %sil, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: vpextrb $3, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT: movzbl %r13b, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT: movzbl %r12b, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: movzbl %r15b, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $5, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT: movzbl %r14b, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $6, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT: movzbl %bpl, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $7, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT: movzbl %bl, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $8, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT: movzbl %r10b, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $9, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT: movzbl %r9b, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $10, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT: movzbl %r8b, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $11, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT: movzbl %dil, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $12, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX2-NEXT: addq %r13, %rdx
+; AVX2-NEXT: vpextrb $3, %xmm0, -40(%rsp,%rdx)
+; AVX2-NEXT: andl $1, %ebp
+; AVX2-NEXT: addq %rdx, %rbp
+; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%rbp)
+; AVX2-NEXT: andl $1, %r12d
+; AVX2-NEXT: addq %rbp, %r12
+; AVX2-NEXT: andl $1, %r15d
+; AVX2-NEXT: addq %r12, %r15
+; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12
+; AVX2-NEXT: andl $15, %r12d
+; AVX2-NEXT: vpextrb $5, %xmm0, -40(%rsp,%r12)
+; AVX2-NEXT: andl $1, %r14d
+; AVX2-NEXT: addq %r15, %r14
+; AVX2-NEXT: # kill: def $r15d killed $r15d killed $r15 def $r15
+; AVX2-NEXT: andl $15, %r15d
+; AVX2-NEXT: vpextrb $6, %xmm0, -40(%rsp,%r15)
+; AVX2-NEXT: andl $1, %ebx
+; AVX2-NEXT: addq %r14, %rbx
+; AVX2-NEXT: # kill: def $r14d killed $r14d killed $r14 def $r14
+; AVX2-NEXT: andl $15, %r14d
+; AVX2-NEXT: vpextrb $7, %xmm0, -40(%rsp,%r14)
+; AVX2-NEXT: andl $1, %r11d
+; AVX2-NEXT: addq %rbx, %r11
+; AVX2-NEXT: # kill: def $ebx killed $ebx killed $rbx def $rbx
+; AVX2-NEXT: andl $15, %ebx
+; AVX2-NEXT: vpextrb $8, %xmm0, -40(%rsp,%rbx)
+; AVX2-NEXT: andl $1, %r10d
+; AVX2-NEXT: addq %r11, %r10
+; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11
+; AVX2-NEXT: andl $15, %r11d
+; AVX2-NEXT: vpextrb $9, %xmm0, -40(%rsp,%r11)
+; AVX2-NEXT: andl $1, %r9d
+; AVX2-NEXT: addq %r10, %r9
+; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10
+; AVX2-NEXT: andl $15, %r10d
+; AVX2-NEXT: vpextrb $10, %xmm0, -40(%rsp,%r10)
+; AVX2-NEXT: andl $1, %r8d
+; AVX2-NEXT: addq %r9, %r8
+; AVX2-NEXT: # kill: def $r9d killed $r9d killed $r9 def $r9
+; AVX2-NEXT: andl $15, %r9d
+; AVX2-NEXT: vpextrb $11, %xmm0, -40(%rsp,%r9)
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: addq %r8, %rdi
+; AVX2-NEXT: # kill: def $r8d killed $r8d killed $r8 def $r8
+; AVX2-NEXT: andl $15, %r8d
+; AVX2-NEXT: vpextrb $12, %xmm0, -40(%rsp,%r8)
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX2-NEXT: andl $1, %esi
+; AVX2-NEXT: addq %rdi, %rsi
+; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
+; AVX2-NEXT: andl $15, %edi
+; AVX2-NEXT: vpextrb $13, %xmm0, -40(%rsp,%rdi)
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $13, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; AVX2-NEXT: addq %rsi, %rax
+; AVX2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi
+; AVX2-NEXT: andl $15, %esi
+; AVX2-NEXT: vpextrb $14, %xmm0, -40(%rsp,%rsi)
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpextrb $14, %xmm0, -40(%rsp,%rax)
-; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrb $15, %xmm0, -40(%rsp,%rcx)
-; AVX2-NEXT: cmpq $15, %rax
-; AVX2-NEXT: movl $15, %ecx
-; AVX2-NEXT: cmovbq %rax, %rcx
-; AVX2-NEXT: vpextrb $15, %xmm0, %eax
-; AVX2-NEXT: cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
-; AVX2-NEXT: movb %al, -40(%rsp,%rcx)
+; AVX2-NEXT: vpextrb $15, %xmm0, -40(%rsp,%rax)
+; AVX2-NEXT: cmpq $15, %rcx
+; AVX2-NEXT: movl $15, %eax
+; AVX2-NEXT: cmovbq %rcx, %rax
+; AVX2-NEXT: vpextrb $15, %xmm0, %ecx
+; AVX2-NEXT: cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
+; AVX2-NEXT: movb %cl, -40(%rsp,%rax)
; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
@@ -1805,140 +1790,137 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $128, %rsp
-; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9
-; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8
-; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movl %ecx, %r13d
-; AVX2-NEXT: movl %edx, %r15d
-; AVX2-NEXT: movl %esi, %ebx
+; AVX2-NEXT: subq $96, %rsp
+; AVX2-NEXT: movl %r9d, %r11d
+; AVX2-NEXT: movl %r8d, %r10d
+; AVX2-NEXT: movl %ecx, %r9d
+; AVX2-NEXT: movl %edx, %r8d
+; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movl 360(%rbp), %eax
-; AVX2-NEXT: movl 352(%rbp), %ecx
+; AVX2-NEXT: movzbl 360(%rbp), %eax
+; AVX2-NEXT: movzbl 352(%rbp), %ecx
; AVX2-NEXT: vmovd %ecx, %xmm4
; AVX2-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 368(%rbp), %eax
+; AVX2-NEXT: movzbl 368(%rbp), %eax
; AVX2-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 376(%rbp), %eax
+; AVX2-NEXT: movzbl 376(%rbp), %eax
; AVX2-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 384(%rbp), %eax
+; AVX2-NEXT: movzbl 384(%rbp), %eax
; AVX2-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 392(%rbp), %eax
+; AVX2-NEXT: movzbl 392(%rbp), %eax
; AVX2-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 400(%rbp), %eax
+; AVX2-NEXT: movzbl 400(%rbp), %eax
; AVX2-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 408(%rbp), %eax
+; AVX2-NEXT: movzbl 408(%rbp), %eax
; AVX2-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 416(%rbp), %eax
+; AVX2-NEXT: movzbl 416(%rbp), %eax
; AVX2-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 424(%rbp), %eax
+; AVX2-NEXT: movzbl 424(%rbp), %eax
; AVX2-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 432(%rbp), %eax
+; AVX2-NEXT: movzbl 432(%rbp), %eax
; AVX2-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 440(%rbp), %eax
+; AVX2-NEXT: movzbl 440(%rbp), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 448(%rbp), %eax
+; AVX2-NEXT: movzbl 448(%rbp), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 456(%rbp), %eax
+; AVX2-NEXT: movzbl 456(%rbp), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 464(%rbp), %eax
+; AVX2-NEXT: movzbl 464(%rbp), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 472(%rbp), %eax
+; AVX2-NEXT: movzbl 472(%rbp), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movl 224(%rbp), %eax
+; AVX2-NEXT: movzbl 224(%rbp), %eax
; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: movl 232(%rbp), %eax
+; AVX2-NEXT: movzbl 232(%rbp), %eax
; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 240(%rbp), %eax
+; AVX2-NEXT: movzbl 240(%rbp), %eax
; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 248(%rbp), %eax
+; AVX2-NEXT: movzbl 248(%rbp), %eax
; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 256(%rbp), %eax
+; AVX2-NEXT: movzbl 256(%rbp), %eax
; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 264(%rbp), %eax
+; AVX2-NEXT: movzbl 264(%rbp), %eax
; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 272(%rbp), %eax
+; AVX2-NEXT: movzbl 272(%rbp), %eax
; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 280(%rbp), %eax
+; AVX2-NEXT: movzbl 280(%rbp), %eax
; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 288(%rbp), %eax
+; AVX2-NEXT: movzbl 288(%rbp), %eax
; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 296(%rbp), %eax
+; AVX2-NEXT: movzbl 296(%rbp), %eax
; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 304(%rbp), %eax
+; AVX2-NEXT: movzbl 304(%rbp), %eax
; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 312(%rbp), %eax
+; AVX2-NEXT: movzbl 312(%rbp), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 320(%rbp), %eax
+; AVX2-NEXT: movzbl 320(%rbp), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 328(%rbp), %eax
+; AVX2-NEXT: movzbl 328(%rbp), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 336(%rbp), %eax
+; AVX2-NEXT: movzbl 336(%rbp), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 344(%rbp), %eax
+; AVX2-NEXT: movzbl 344(%rbp), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX2-NEXT: movl 96(%rbp), %eax
-; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: movl 104(%rbp), %eax
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 112(%rbp), %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 120(%rbp), %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 128(%rbp), %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 136(%rbp), %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 144(%rbp), %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 152(%rbp), %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 160(%rbp), %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 168(%rbp), %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 176(%rbp), %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 184(%rbp), %eax
+; AVX2-NEXT: vmovd %edi, %xmm5
+; AVX2-NEXT: vpinsrb $1, %esi, %xmm5, %xmm5
+; AVX2-NEXT: vpinsrb $2, %edx, %xmm5, %xmm5
+; AVX2-NEXT: vpinsrb $3, %r9d, %xmm5, %xmm5
+; AVX2-NEXT: vpinsrb $4, %r10d, %xmm5, %xmm5
+; AVX2-NEXT: vpinsrb $5, %r11d, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 16(%rbp), %ebx
+; AVX2-NEXT: vpinsrb $6, %ebx, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 24(%rbp), %r14d
+; AVX2-NEXT: vpinsrb $7, %r14d, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 32(%rbp), %r15d
+; AVX2-NEXT: vpinsrb $8, %r15d, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 40(%rbp), %r12d
+; AVX2-NEXT: vpinsrb $9, %r12d, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 48(%rbp), %r13d
+; AVX2-NEXT: vpinsrb $10, %r13d, %xmm5, %xmm5
+; AVX2-NEXT: movzbl 56(%rbp), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 192(%rbp), %eax
+; AVX2-NEXT: movzbl 64(%rbp), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 200(%rbp), %eax
+; AVX2-NEXT: movzbl 72(%rbp), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 208(%rbp), %eax
+; AVX2-NEXT: movzbl 80(%rbp), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movl 216(%rbp), %eax
+; AVX2-NEXT: movzbl 88(%rbp), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX2-NEXT: vmovd %edi, %xmm6
-; AVX2-NEXT: vpinsrb $1, %esi, %xmm6, %xmm6
-; AVX2-NEXT: vpinsrb $2, %edx, %xmm6, %xmm6
-; AVX2-NEXT: vpinsrb $3, %r13d, %xmm6, %xmm6
-; AVX2-NEXT: vpinsrb $4, %r8d, %xmm6, %xmm6
-; AVX2-NEXT: vpinsrb $5, %r9d, %xmm6, %xmm6
-; AVX2-NEXT: movl 16(%rbp), %esi
-; AVX2-NEXT: vpinsrb $6, %esi, %xmm6, %xmm6
-; AVX2-NEXT: movl 24(%rbp), %edi
-; AVX2-NEXT: vpinsrb $7, %edi, %xmm6, %xmm6
-; AVX2-NEXT: movl 32(%rbp), %r8d
-; AVX2-NEXT: vpinsrb $8, %r8d, %xmm6, %xmm6
-; AVX2-NEXT: movl 40(%rbp), %r9d
-; AVX2-NEXT: vpinsrb $9, %r9d, %xmm6, %xmm6
-; AVX2-NEXT: movl 48(%rbp), %r10d
-; AVX2-NEXT: vpinsrb $10, %r10d, %xmm6, %xmm6
-; AVX2-NEXT: movl 56(%rbp), %r11d
-; AVX2-NEXT: vpinsrb $11, %r11d, %xmm6, %xmm6
-; AVX2-NEXT: movl 64(%rbp), %r14d
-; AVX2-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6
-; AVX2-NEXT: movl 72(%rbp), %r12d
-; AVX2-NEXT: vpinsrb $13, %r12d, %xmm6, %xmm6
-; AVX2-NEXT: movl 80(%rbp), %eax
+; AVX2-NEXT: movzbl 96(%rbp), %eax
+; AVX2-NEXT: vmovd %eax, %xmm6
+; AVX2-NEXT: movzbl 104(%rbp), %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 112(%rbp), %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 120(%rbp), %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 128(%rbp), %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 136(%rbp), %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 144(%rbp), %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 152(%rbp), %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 160(%rbp), %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 168(%rbp), %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 176(%rbp), %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 184(%rbp), %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 192(%rbp), %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 200(%rbp), %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl 208(%rbp), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movl 88(%rbp), %eax
+; AVX2-NEXT: movzbl 216(%rbp), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm6, %xmm6
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX2-NEXT: vpand %ymm6, %ymm4, %ymm4
@@ -1980,379 +1962,435 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX2-NEXT: vmovaps %ymm2, (%rsp)
; AVX2-NEXT: movzbl %al, %eax
; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: movzbl (%rsp,%rax), %eax
-; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: movzbl (%rsp,%rax), %edx
; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp)
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: andl $1, %ebx
-; AVX2-NEXT: addq %rax, %rbx
-; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rbx)
-; AVX2-NEXT: andl $1, %r15d
-; AVX2-NEXT: addq %rbx, %r15
-; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r15)
-; AVX2-NEXT: andl $1, %r13d
-; AVX2-NEXT: addq %r15, %r13
-; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r13)
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %r13, %rcx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: andl $1, %esi
-; AVX2-NEXT: addq %rax, %rsi
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax)
; AVX2-NEXT: andl $1, %edi
-; AVX2-NEXT: addq %rsi, %rdi
-; AVX2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi
-; AVX2-NEXT: andl $63, %esi
-; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rsi)
+; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rdi)
+; AVX2-NEXT: andl $1, %esi
+; AVX2-NEXT: addq %rdi, %rsi
+; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rsi)
; AVX2-NEXT: andl $1, %r8d
-; AVX2-NEXT: addq %rdi, %r8
-; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
-; AVX2-NEXT: andl $63, %edi
-; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rdi)
+; AVX2-NEXT: addq %rsi, %r8
+; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r8)
; AVX2-NEXT: andl $1, %r9d
; AVX2-NEXT: addq %r8, %r9
-; AVX2-NEXT: # kill: def $r8d killed $r8d killed $r8 def $r8
-; AVX2-NEXT: andl $63, %r8d
-; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%r8)
+; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r9)
; AVX2-NEXT: andl $1, %r10d
; AVX2-NEXT: addq %r9, %r10
-; AVX2-NEXT: # kill: def $r9d killed $r9d killed $r9 def $r9
-; AVX2-NEXT: andl $63, %r9d
-; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%r9)
+; AVX2-NEXT: movl %r10d, %eax
+; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
; AVX2-NEXT: andl $1, %r11d
; AVX2-NEXT: addq %r10, %r11
-; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10
-; AVX2-NEXT: andl $63, %r10d
-; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%r10)
-; AVX2-NEXT: andl $1, %r14d
-; AVX2-NEXT: addq %r11, %r14
+; AVX2-NEXT: movzbl %bl, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %r11, %rax
; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11
; AVX2-NEXT: andl $63, %r11d
-; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%r11)
-; AVX2-NEXT: andl $1, %r12d
-; AVX2-NEXT: addq %r14, %r12
-; AVX2-NEXT: # kill: def $r14d killed $r14d killed $r14 def $r14
-; AVX2-NEXT: andl $63, %r14d
-; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%r14)
-; AVX2-NEXT: movl 80(%rbp), %eax
+; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%r11)
+; AVX2-NEXT: movzbl %r14b, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movzbl %r15b, %eax
; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %r12, %rax
-; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12
-; AVX2-NEXT: andl $63, %r12d
-; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%r12)
-; AVX2-NEXT: movl 88(%rbp), %ecx
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl %r12b, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 96(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movzbl %r13b, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 56(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movzbl 64(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 72(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movzbl 80(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 88(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movzbl 96(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 104(%rbp), %ecx
+; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 104(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 112(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 120(%rbp), %ecx
+; AVX2-NEXT: movzbl 112(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 120(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 128(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 136(%rbp), %ecx
+; AVX2-NEXT: movzbl 128(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 136(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 144(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 152(%rbp), %ecx
+; AVX2-NEXT: movzbl 144(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 152(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 160(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 168(%rbp), %ecx
+; AVX2-NEXT: movzbl 160(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 168(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 176(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 184(%rbp), %ecx
+; AVX2-NEXT: movzbl 176(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 184(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 192(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 200(%rbp), %ecx
+; AVX2-NEXT: movzbl 192(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 200(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 208(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 216(%rbp), %ecx
+; AVX2-NEXT: movzbl 208(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 216(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 224(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 232(%rbp), %ecx
+; AVX2-NEXT: movzbl 224(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 232(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $1, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 240(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 248(%rbp), %ecx
+; AVX2-NEXT: movzbl 240(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 248(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $3, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 256(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 264(%rbp), %ecx
+; AVX2-NEXT: movzbl 256(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 264(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $5, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 272(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 280(%rbp), %ecx
+; AVX2-NEXT: movzbl 272(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 280(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $7, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 288(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 296(%rbp), %ecx
+; AVX2-NEXT: movzbl 288(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 296(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $9, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 304(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 312(%rbp), %ecx
+; AVX2-NEXT: movzbl 304(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 312(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $11, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 320(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 328(%rbp), %ecx
+; AVX2-NEXT: movzbl 320(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 328(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $13, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 336(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 344(%rbp), %ecx
+; AVX2-NEXT: movzbl 336(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 344(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $15, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movl 352(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: movzbl 352(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 360(%rbp), %ecx
+; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 360(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 368(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 376(%rbp), %ecx
+; AVX2-NEXT: movzbl 368(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 376(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 384(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 392(%rbp), %ecx
+; AVX2-NEXT: movzbl 384(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 392(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 400(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 408(%rbp), %ecx
+; AVX2-NEXT: movzbl 400(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 408(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 416(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 424(%rbp), %ecx
+; AVX2-NEXT: movzbl 416(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 424(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 432(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 440(%rbp), %ecx
+; AVX2-NEXT: movzbl 432(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 440(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 448(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 456(%rbp), %ecx
+; AVX2-NEXT: movzbl 448(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 456(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 464(%rbp), %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movl 472(%rbp), %ecx
+; AVX2-NEXT: movzbl 464(%rbp), %eax
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx)
+; AVX2-NEXT: movzbl 472(%rbp), %ecx
+; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax)
; AVX2-NEXT: vpextrb $15, %xmm0, %eax
; AVX2-NEXT: cmpq $64, %rcx
-; AVX2-NEXT: cmovbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; AVX2-NEXT: cmovbl %edx, %eax
; AVX2-NEXT: cmpq $63, %rcx
-; AVX2-NEXT: movq %rcx, %rdx
-; AVX2-NEXT: movl $63, %ecx
-; AVX2-NEXT: cmovbq %rdx, %rcx
-; AVX2-NEXT: movb %al, (%rsp,%rcx)
+; AVX2-NEXT: movl $63, %edx
+; AVX2-NEXT: cmovbq %rcx, %rdx
+; AVX2-NEXT: movb %al, (%rsp,%rdx)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: leaq -40(%rbp), %rsp
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index e60b56551e58d..d0690bd291f31 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -509,10 +509,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrlw $7, %xmm3
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: paddb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: paddb %xmm2, %xmm4
+; SSE2-NEXT: psrlw $7, %xmm4
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm1, %xmm0
@@ -545,10 +545,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $7, %xmm0
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm3
+; SSE41-NEXT: psrlw $7, %xmm3
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT: por %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
@@ -572,10 +572,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $7, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm3
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -704,10 +704,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; X86-SSE2-NEXT: pandn %xmm3, %xmm2
; X86-SSE2-NEXT: por %xmm4, %xmm2
; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT: psrlw $7, %xmm3
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE2-NEXT: paddb %xmm2, %xmm3
; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
-; X86-SSE2-NEXT: paddb %xmm2, %xmm4
+; X86-SSE2-NEXT: psrlw $7, %xmm4
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
; X86-SSE2-NEXT: por %xmm3, %xmm4
; X86-SSE2-NEXT: paddb %xmm1, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 11a02f8cf754c..421fa98709d48 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -431,10 +431,10 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -451,10 +451,10 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
+; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index d9799975cd37a..4969cb500d4df 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -533,10 +533,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: psrlw $7, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: paddb %xmm2, %xmm4
+; SSE2-NEXT: psrlw $7, %xmm4
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: paddb %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm3, %xmm0
@@ -568,10 +568,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlw $7, %xmm1
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddb %xmm2, %xmm3
+; SSE41-NEXT: psrlw $7, %xmm3
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT: por %xmm1, %xmm3
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
@@ -596,10 +596,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $7, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm3
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -731,10 +731,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm4, %xmm2
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT: psrlw $7, %xmm1
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: paddb %xmm2, %xmm1
; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
-; X86-SSE2-NEXT: paddb %xmm2, %xmm4
+; X86-SSE2-NEXT: psrlw $7, %xmm4
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
; X86-SSE2-NEXT: por %xmm1, %xmm4
; X86-SSE2-NEXT: paddb %xmm3, %xmm3
; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index 15e09c3b6737e..e2a3e261c0411 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -457,10 +457,10 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index 6c79be75550ed..93f4ce7573ad1 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -442,10 +442,10 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrlw $7, %xmm3
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: paddb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: paddb %xmm2, %xmm4
+; SSE2-NEXT: psrlw $7, %xmm4
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm1, %xmm0
@@ -478,10 +478,10 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $7, %xmm0
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm3
+; SSE41-NEXT: psrlw $7, %xmm3
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT: por %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
@@ -505,10 +505,10 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $7, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm3
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -637,10 +637,10 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X86-SSE2-NEXT: pandn %xmm3, %xmm2
; X86-SSE2-NEXT: por %xmm4, %xmm2
; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT: psrlw $7, %xmm3
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE2-NEXT: paddb %xmm2, %xmm3
; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
-; X86-SSE2-NEXT: paddb %xmm2, %xmm4
+; X86-SSE2-NEXT: psrlw $7, %xmm4
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
; X86-SSE2-NEXT: por %xmm3, %xmm4
; X86-SSE2-NEXT: paddb %xmm1, %xmm1
; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index 684721f434ebd..64c31187f29ef 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -375,10 +375,10 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -395,10 +395,10 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
+; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: retq