[llvm] peephole-opt: Fold uses of REG_SEQUENCE subregisters (WIP) (PR #161225)
Frederik Harwath via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 29 09:25:19 PDT 2025
https://github.com/frederik-h created https://github.com/llvm/llvm-project/pull/161225
The goal of this PR is to free si-peephole-sdwa from the need to deal with REG_SEQUENCE instructions. This will fix issue #130102. PR #133087 attempted to handle those instructions in si-peephole-sdwa which is cumbersome. It has been suggested that we could actually get rid of the instructions in many cases before si-peephole-sdwa instead.
This work-in-progress PR adapts the existing copy coalescing logic in peephole-opt to rewrite uses of subregisters of registers defined by REG_SEQUENCE instructions into direct uses of the registers defining the subregisters.
>From c9f2126ce0c12fd2441db72b0a18934d6a2bfc28 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Fri, 26 Sep 2025 03:20:15 -0400
Subject: [PATCH] peephole-opt: Fold uses of REG_SEQUENCE subregisters (WIP)
The goal of this PR is to free si-peephole-sdwa from the need to deal
with REG_SEQUENCE instructions. This will fix issue #130102. PR
#133087 attempted to handle those instructions in
si-peephole-sdwa which is cumbersome. It has been suggested that we
could actually get rid of the instructions in many cases before
si-peephole-sdwa instead.
This work-in-progress PR adapts the existing copy coalescing logic in
peephole-opt to rewrite uses of subregisters of registers defined by
REG_SEQUENCE instructions into direct uses of the registers defining
the subregisters.
The cases that should be handled by this optimization still need to be
narrowed down.
---
llvm/lib/CodeGen/PeepholeOptimizer.cpp | 72 ++
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 42 +-
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 693 +++++-----
llvm/test/CodeGen/AMDGPU/load-global-i8.ll | 22 +-
llvm/test/CodeGen/AMDGPU/load-local-i16.ll | 1120 ++++++++---------
.../AMDGPU/sdwa-peephole-reg-sequence.mir | 134 ++
6 files changed, 1115 insertions(+), 968 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index fb3e6482bb096..6104eadb05f59 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -220,6 +220,58 @@ class CopyRewriter : public Rewriter {
}
};
+/// Rewriter for REG_SEQUENCE source operands. This class is used for
+/// rewriting uses of REG_SEQUENCE operands in arbitrary instructions
+/// whereas the RegSequenceRewriter is used for rewriting REG_SEQUENCE
+/// instructions.
+class RegSequenceSrcRewriter : public Rewriter {
+private:
+ MachineRegisterInfo &MRI;
+ const MachineOperand &MODef;
+
+public:
+ RegSequenceSrcRewriter(MachineInstr &MI, MachineRegisterInfo &MRI)
+ : Rewriter(MI), MRI(MRI), MODef(CopyLike.getOperand(0)) {
+ assert(MODef.isReg() && !MODef.getReg().isPhysical());
+ }
+ virtual ~RegSequenceSrcRewriter() = default;
+
+ bool getNextRewritableSource(RegSubRegPair &Src,
+ RegSubRegPair &Dst) override {
+ long NumOperands = static_cast<long>(CopyLike.getNumOperands());
+ const MachineOperand &MODef = CopyLike.getOperand(0);
+ Dst = RegSubRegPair(MODef.getReg(), MODef.getSubReg());
+
+ for (CurrentSrcIdx++; CurrentSrcIdx < NumOperands; CurrentSrcIdx++) {
+ MachineOperand &Op = CopyLike.getOperand(CurrentSrcIdx);
+ if (!Op.isReg())
+ continue;
+
+ Register Reg = Op.getReg();
+ if (!Reg.isVirtual() || !MRI.getUniqueVRegDef(Reg)->isRegSequence())
+ continue;
+
+ break;
+ }
+
+ if (CurrentSrcIdx == NumOperands)
+ return false;
+
+ MachineOperand &Op = CopyLike.getOperand(CurrentSrcIdx);
+
+ Src = RegSubRegPair(Op.getReg(), Op.getSubReg());
+
+ return true;
+ }
+
+ bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override {
+ MachineOperand &MOSrc = CopyLike.getOperand(CurrentSrcIdx);
+ MOSrc.setReg(NewReg);
+ MOSrc.setSubReg(NewSubReg);
+ return true;
+ }
+};
+
/// Helper class to rewrite uncoalescable copy like instructions
/// into new COPY (coalescable friendly) instructions.
class UncoalescableRewriter : public Rewriter {
@@ -453,6 +505,7 @@ class PeepholeOptimizer : private MachineFunction::Delegate {
bool optimizeCoalescableCopyImpl(Rewriter &&CpyRewriter);
bool optimizeCoalescableCopy(MachineInstr &MI);
+ bool optimizeRegSequenceUses(MachineInstr &MI, MachineRegisterInfo &MRI);
bool optimizeUncoalescableCopy(MachineInstr &MI,
SmallPtrSetImpl<MachineInstr *> &LocalMIs);
bool optimizeRecurrence(MachineInstr &PHI);
@@ -1258,6 +1311,20 @@ bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr &MI) {
}
}
+bool PeepholeOptimizer::optimizeRegSequenceUses(MachineInstr &MI,
+ MachineRegisterInfo &MRI) {
+ if (MI.getNumOperands() < 1)
+ return false;
+
+ const MachineOperand &MODef = MI.getOperand(0);
+
+ // Do not rewrite physical definitions.
+ if (!MODef.isReg() || MODef.getReg().isPhysical())
+ return false;
+
+ return optimizeCoalescableCopyImpl(RegSequenceSrcRewriter(MI, MRI));
+}
+
/// Rewrite the source found through \p Def, by using the \p RewriteMap
/// and create a new COPY instruction. More info about RewriteMap in
/// PeepholeOptimizer::findNextSource. Right now this is only used to handle
@@ -1807,6 +1874,11 @@ bool PeepholeOptimizer::run(MachineFunction &MF) {
continue;
}
+ if (!MI->isCopy() && optimizeRegSequenceUses(*MI, *MRI)) {
+ LLVM_DEBUG(dbgs() << "Optimized REG_SEQUENCE uses: " << *MI << '\n');
+ Changed = true;
+ }
+
if (isMoveImmediate(*MI, ImmDefRegs, ImmDefMIs)) {
SeenMoveImm = true;
} else {
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index f93e5f06beff9..c4c80510823ec 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -5417,10 +5417,10 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_u8 v0, v4, s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_lshrrev_b32_e32 v2, 1, v0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 1, v0
; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 1
+; GFX1250-NEXT: v_bfe_i32 v2, v1, 0, 1
; GFX1250-NEXT: v_dual_ashrrev_i32 v1, 31, v0 :: v_dual_ashrrev_i32 v3, 31, v2
; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX1250-NEXT: s_endpgm
@@ -5662,23 +5662,21 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX1250-LABEL: constant_sextload_v3i1_to_v3i64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: v_mov_b32_e32 v5, 0
+; GFX1250-NEXT: v_mov_b32_e32 v6, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_u8 v0, v5, s[2:3]
+; GFX1250-NEXT: global_load_u8 v0, v6, s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_dual_lshrrev_b32 v2, 2, v0 :: v_dual_lshrrev_b32 v4, 1, v0
+; GFX1250-NEXT: v_dual_lshrrev_b32 v1, 2, v0 :: v_dual_lshrrev_b32 v2, 1, v0
; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_bfe_i32 v6, v2, 0, 1
+; GFX1250-NEXT: v_bfe_i32 v4, v1, 0, 1
; GFX1250-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_bfe_i32 v2, v4, 0, 1
-; GFX1250-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1250-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 1
+; GFX1250-NEXT: v_dual_ashrrev_i32 v5, 31, v4 :: v_dual_ashrrev_i32 v3, 31, v2
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_store_b64 v5, v[6:7], s[0:1] offset:16
-; GFX1250-NEXT: global_store_b128 v5, v[0:3], s[0:1]
+; GFX1250-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v6, v[0:3], s[0:1]
; GFX1250-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
%ext = sext <3 x i1> %load to <3 x i64>
@@ -5949,24 +5947,24 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX1250-LABEL: constant_sextload_v4i1_to_v4i64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: v_mov_b32_e32 v9, 0
+; GFX1250-NEXT: v_mov_b32_e32 v8, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_u8 v0, v9, s[2:3]
+; GFX1250-NEXT: global_load_u8 v0, v8, s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_dual_lshrrev_b32 v2, 3, v0 :: v_dual_lshrrev_b32 v4, 2, v0
-; GFX1250-NEXT: v_lshrrev_b32_e32 v8, 1, v0
+; GFX1250-NEXT: v_dual_lshrrev_b32 v1, 3, v0 :: v_dual_lshrrev_b32 v2, 2, v0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v3, 1, v0
; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_bfe_i32 v6, v2, 0, 1
-; GFX1250-NEXT: v_bfe_i32 v4, v4, 0, 1
+; GFX1250-NEXT: v_bfe_i32 v6, v1, 0, 1
+; GFX1250-NEXT: v_bfe_i32 v4, v2, 0, 1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_bfe_i32 v2, v8, 0, 1
+; GFX1250-NEXT: v_bfe_i32 v2, v3, 0, 1
; GFX1250-NEXT: v_dual_ashrrev_i32 v1, 31, v0 :: v_dual_ashrrev_i32 v7, 31, v6
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-NEXT: v_dual_ashrrev_i32 v5, 31, v4 :: v_dual_ashrrev_i32 v3, 31, v2
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_store_b128 v9, v[4:7], s[0:1] offset:16
-; GFX1250-NEXT: global_store_b128 v9, v[0:3], s[0:1]
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX1250-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
%ext = sext <4 x i1> %load to <4 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index bca39d06e941c..0e17e809fec90 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -5985,14 +5985,13 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 16, v1
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v2, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v3, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
@@ -6011,52 +6010,50 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GCN-HSA-NEXT: v_bfe_i32 v4, v3, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1
-; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[1:2], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v2
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v6, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v3, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v4i16_to_v4i64:
@@ -6362,7 +6359,6 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v3
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16
@@ -6372,7 +6368,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v3
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v3
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v5, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v3, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v7, 0, 16
@@ -6407,32 +6403,31 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, v3
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1
+; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3
; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GCN-HSA-NEXT: s_endpgm
@@ -6451,12 +6446,11 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v3
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v3, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v11, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v3, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16
@@ -6980,27 +6974,25 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v7
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v5
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v5
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v5, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v7
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v7
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v6, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v6, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v5
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v5
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v5, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v7
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 16, v7
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v7, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v0, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v18, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v17, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v9, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v11, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v10, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v9, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v3, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v1
@@ -7011,22 +7003,22 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v2, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
@@ -7070,57 +7062,55 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v5
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7
-; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6
-; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v4
-; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
-; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v9, v18, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v7
+; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[7:10]
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v6
+; GCN-HSA-NEXT: v_bfe_i32 v7, v5, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10]
; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v9, v11, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7
-; GCN-HSA-NEXT: v_bfe_i32 v6, v19, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v6, v14, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10]
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, v3
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v0
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1
+; GCN-HSA-NEXT: v_bfe_i32 v4, v3, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v3
; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v2, v18, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT: v_bfe_i32 v2, v15, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v8, v15, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11]
; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15]
; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3]
; GCN-HSA-NEXT: s_endpgm
@@ -7146,48 +7136,46 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v1, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v5
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v5, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v3
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v3, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v7
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v5, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v6, 0, 16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v6, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v9, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v11, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v13, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v3, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v22, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v7, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v20, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v3, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v5, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v7, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 31, v29
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
@@ -8119,19 +8107,17 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v15
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v3
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v7
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v23, 0, 16
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v15
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v15
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v15, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:240
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
@@ -8140,11 +8126,10 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v13, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v11
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v27, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v3, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
@@ -8153,35 +8138,34 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v22, 0, 16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v7
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v7
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v7
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v7
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v7, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v5
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v5
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v5, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v5
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v5
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v5, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v11
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v11
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v11
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v11
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v11, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v12, 0, 16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v14, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v14, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v21, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v20, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v20, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v22, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v8, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v25
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:224
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v9
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v9, 0, 16
@@ -8240,8 +8224,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
@@ -8249,161 +8233,157 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT: v_bfe_i32 v16, v13, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v13
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19]
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v15
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v15
+; GCN-HSA-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[15:18]
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3
+; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0
-; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v13
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v13
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19]
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, v15
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2
+; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[15:18]
+; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v13, v14, 0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
-; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[13:16]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v15
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[13:16]
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v12
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0
; GCN-HSA-NEXT: v_bfe_i32 v12, v12, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
-; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
-; GCN-HSA-NEXT: s_waitcnt vmcnt(6)
-; GCN-HSA-NEXT: v_bfe_i32 v12, v9, 0, 16
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v10
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, v11
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v8
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v9
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
-; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; GCN-HSA-NEXT: v_bfe_i32 v16, v29, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v14, v18, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v8, v10, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v10, v28, 0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15]
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v11
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v11
+; GCN-HSA-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14]
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2
+; GCN-HSA-NEXT: v_bfe_i32 v11, v9, 0, 16
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v11
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v9
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v9
+; GCN-HSA-NEXT: v_bfe_i32 v15, v10, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GCN-HSA-NEXT: v_bfe_i32 v10, v28, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v17, v27, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15
; GCN-HSA-NEXT: s_waitcnt vmcnt(5)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v2
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v2
+; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[11:14]
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
-; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v10, v20, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
+; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v14, v19, 0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[15:18]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[8:11]
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, v3
-; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[12:15]
+; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v3
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
-; GCN-HSA-NEXT: v_bfe_i32 v8, v21, 0, 16
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v4
-; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[8:11]
-; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v5
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v5
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1
+; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v24, v0, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v26, v26, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
+; GCN-HSA-NEXT: v_bfe_i32 v26, v27, 0, 16
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GCN-HSA-NEXT: v_bfe_i32 v14, v27, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v26
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v20
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[20:23]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, v7
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v4
+; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-HSA-NEXT: v_bfe_i32 v18, v18, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GCN-HSA-NEXT: v_bfe_i32 v0, v19, 0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[10:13]
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: v_bfe_i32 v10, v19, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
+; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v2, 16, v7
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v5
+; GCN-HSA-NEXT: v_bfe_i32 v12, v5, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64:
@@ -8416,115 +8396,110 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[5:8], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[13:16], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[9:12], off, s[8:11], 0 offset:32
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[1:4], off, s[8:11], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[15:18], off, s[8:11], 0 offset:48
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v6, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2)
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v15, 0, 16
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v15, 0, 16
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v16
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v6, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v16, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v13
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v13, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v17
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v17, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v18, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v15
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2)
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v4, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:192
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v16, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:160
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v14, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v6, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4)
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v11
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v11
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:176
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v9, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v11, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:192
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v12
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v19, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v8
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v8, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v9, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v10
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v9, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v7, 0, 16
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v10, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v8, 31, v7
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8)
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v3, 0, 16
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v3, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v12
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v5, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v5
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v12, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v1, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v8, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v7, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v10, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v10
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v5, 0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v4
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v5, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v4, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v17, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v18, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v16, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v1, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v3, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v7, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 31, v29
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v4, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v6, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v19, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v7, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v20, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v3, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 31, v26
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 31, v28
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index f879dc660203f..81791632a398b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -5929,17 +5929,17 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v4i8_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 24, v0
@@ -5952,8 +5952,8 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v4i8_to_v4i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index bd191a37582c0..3d3f9b4148f22 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -5738,20 +5738,19 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out,
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read_b64 v[0:1], v0
-; SI-NEXT: v_mov_b32_e32 v9, s0
+; SI-NEXT: v_mov_b32_e32 v8, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; SI-NEXT: v_bfe_i32 v3, v3, 0, 16
-; SI-NEXT: v_bfe_i32 v5, v0, 0, 16
-; SI-NEXT: v_bfe_i32 v7, v4, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
-; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7
-; SI-NEXT: ds_write2_b64 v9, v[3:4], v[1:2] offset0:2 offset1:3
-; SI-NEXT: ds_write2_b64 v9, v[5:6], v[7:8] offset1:1
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v1
+; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v1
+; SI-NEXT: v_bfe_i32 v4, v1, 0, 16
+; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT: v_bfe_i32 v6, v5, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; SI-NEXT: ds_write2_b64 v8, v[4:5], v[2:3] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v8, v[0:1], v[6:7] offset1:1
; SI-NEXT: s_endpgm
;
; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64:
@@ -5761,20 +5760,20 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out,
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0
-; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, s0
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; VI-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3
-; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1
+; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v7, v4, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v5, v2, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[1:2], v[7:8] offset0:2 offset1:3
+; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; VI-NO-DS128-NEXT: s_endpgm
;
; GFX9-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64:
@@ -5783,20 +5782,20 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out,
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0
-; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s0
+; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, s0
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v7, v4, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v2, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[1:2], v[7:8] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[3:4], v[5:6] offset1:1
; GFX9-NO-DS128-NEXT: s_endpgm
;
; EG-LABEL: local_sextload_v4i16_to_v4i64:
@@ -5850,22 +5849,21 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out,
; VI-DS128-NEXT: s_mov_b32 m0, -1
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
-; VI-DS128-NEXT: ds_read_b64 v[0:1], v0
+; VI-DS128-NEXT: ds_read_b64 v[1:2], v0
; VI-DS128-NEXT: v_mov_b32_e32 v8, s0
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_mov_b32_e32 v3, v1
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; VI-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v0, v2, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v6, v3, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3]
+; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16
+; VI-DS128-NEXT: ds_write_b128 v8, v[4:7]
; VI-DS128-NEXT: s_endpgm
;
; GFX9-DS128-LABEL: local_sextload_v4i16_to_v4i64:
@@ -5873,22 +5871,21 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out,
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-DS128-NEXT: ds_read_b64 v[0:1], v0
+; GFX9-DS128-NEXT: ds_read_b64 v[1:2], v0
; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v0, v2, 0, 16
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v6, v3, 0, 16
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16
-; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3]
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7]
; GFX9-DS128-NEXT: s_endpgm
%load = load <4 x i16>, ptr addrspace(3) %in
%ext = sext <4 x i16> %load to <4 x i64>
@@ -6144,9 +6141,8 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out,
; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; SI-NEXT: v_mov_b32_e32 v16, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v3
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v1
; SI-NEXT: v_ashrrev_i32_e32 v4, 16, v1
; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v3
@@ -6154,9 +6150,9 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out,
; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
; SI-NEXT: v_bfe_i32 v8, v1, 0, 16
; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
-; SI-NEXT: v_bfe_i32 v10, v9, 0, 16
+; SI-NEXT: v_bfe_i32 v10, v3, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT: v_bfe_i32 v12, v12, 0, 16
+; SI-NEXT: v_bfe_i32 v12, v9, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; SI-NEXT: v_bfe_i32 v14, v11, 0, 16
@@ -6178,14 +6174,13 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out,
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s0
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v3
-; VI-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16
-; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v3
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; VI-NO-DS128-NEXT: v_bfe_i32 v10, v10, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v2, v3, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v6, v5, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v8, v7, 0, 16
@@ -6213,14 +6208,13 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out,
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, s0
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16
-; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v3
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v3, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v5, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v7, 0, 16
@@ -6338,14 +6332,13 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out,
; VI-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; VI-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; VI-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16
-; VI-DS128-NEXT: v_mov_b32_e32 v0, v3
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; VI-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
@@ -6372,15 +6365,14 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out,
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX9-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16
-; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v3
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX9-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
@@ -6820,18 +6812,16 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out
; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1
; SI-NEXT: v_mov_b32_e32 v18, s0
; SI-NEXT: s_waitcnt lgkmcnt(1)
-; SI-NEXT: v_mov_b32_e32 v12, v3
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v14, v7
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4
; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v5
; SI-NEXT: v_ashrrev_i32_e32 v8, 16, v5
; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v3
; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v3
-; SI-NEXT: v_bfe_i32 v12, v12, 0, 16
+; SI-NEXT: v_bfe_i32 v12, v3, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:14 offset1:15
; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v1
@@ -6841,7 +6831,7 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out
; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:10 offset1:11
; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v7
; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v7
-; SI-NEXT: v_bfe_i32 v12, v14, 0, 16
+; SI-NEXT: v_bfe_i32 v12, v7, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:6 offset1:7
; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
@@ -6849,10 +6839,10 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out
; SI-NEXT: v_bfe_i32 v5, v6, 0, 16
; SI-NEXT: v_bfe_i32 v10, v0, 0, 16
; SI-NEXT: v_bfe_i32 v7, v2, 0, 16
-; SI-NEXT: v_bfe_i32 v12, v19, 0, 16
+; SI-NEXT: v_bfe_i32 v12, v17, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; SI-NEXT: v_bfe_i32 v14, v17, 0, 16
+; SI-NEXT: v_bfe_i32 v14, v14, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; SI-NEXT: v_bfe_i32 v16, v16, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
@@ -6898,38 +6888,36 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v7
; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7
-; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, v7
-; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16
-; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, v3
+; VI-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2
-; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15
-; VI-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16
+; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[14:15] offset0:14 offset1:15
+; VI-NO-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v2, v3, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; VI-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7
-; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5
-; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3
-; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1
+; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[4:5] offset0:6 offset1:7
+; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[12:13] offset0:4 offset1:5
+; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[10:11] offset0:2 offset1:3
+; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[8:9] offset1:1
; VI-NO-DS128-NEXT: s_endpgm
;
; GFX9-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64:
@@ -6960,38 +6948,36 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v7
; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7
-; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, v7
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16
-; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v18, v3
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[14:15] offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v3, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[4:5] offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[12:13] offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[10:11] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[8:9] offset1:1
; GFX9-NO-DS128-NEXT: s_endpgm
;
; EG-LABEL: local_sextload_v16i16_to_v16i64:
@@ -7170,124 +7156,120 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-DS128-NEXT: s_mov_b32 m0, -1
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
-; VI-DS128-NEXT: ds_read_b128 v[3:6], v0
-; VI-DS128-NEXT: ds_read_b128 v[7:10], v0 offset:16
+; VI-DS128-NEXT: v_mov_b32_e32 v4, s1
+; VI-DS128-NEXT: ds_read_b128 v[0:3], v4
+; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16
+; VI-DS128-NEXT: v_mov_b32_e32 v16, s0
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-DS128-NEXT: v_mov_b32_e32 v18, v6
+; VI-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_bfe_i32 v11, v8, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-DS128-NEXT: v_bfe_i32 v13, v8, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; VI-DS128-NEXT: v_mov_b32_e32 v8, s0
-; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:80
-; VI-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16
-; VI-DS128-NEXT: v_mov_b32_e32 v15, v10
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10
-; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64
-; VI-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v9
-; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112
-; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; VI-DS128-NEXT: v_bfe_i32 v12, v5, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-DS128-NEXT: v_bfe_i32 v14, v5, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; VI-DS128-NEXT: ds_write_b128 v16, v[12:15] offset:80
+; VI-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; VI-DS128-NEXT: ds_write_b128 v16, v[12:15] offset:64
+; VI-DS128-NEXT: v_bfe_i32 v12, v7, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; VI-DS128-NEXT: ds_write_b128 v16, v[12:15] offset:112
+; VI-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; VI-DS128-NEXT: v_bfe_i32 v4, v6, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; VI-DS128-NEXT: ds_write_b128 v16, v[4:7] offset:96
+; VI-DS128-NEXT: v_bfe_i32 v4, v2, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; VI-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96
-; VI-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; VI-DS128-NEXT: v_bfe_i32 v12, v1, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48
-; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32
-; VI-DS128-NEXT: ds_write_b128 v8, v[10:13] offset:16
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3]
+; VI-DS128-NEXT: ds_write_b128 v16, v[0:3] offset:48
+; VI-DS128-NEXT: ds_write_b128 v16, v[4:7] offset:32
+; VI-DS128-NEXT: ds_write_b128 v16, v[12:15] offset:16
+; VI-DS128-NEXT: ds_write_b128 v16, v[8:11]
; VI-DS128-NEXT: s_endpgm
;
; GFX9-DS128-LABEL: local_sextload_v16i16_to_v16i64:
; GFX9-DS128: ; %bb.0:
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-DS128-NEXT: ds_read_b128 v[3:6], v0
-; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v0 offset:16
+; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4
+; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16
+; GFX9-DS128-NEXT: v_mov_b32_e32 v16, s0
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; GFX9-DS128-NEXT: v_bfe_i32 v11, v8, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v13, v3, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0
-; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:80
-; GFX9-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16
-; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v10
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10
-; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64
-; GFX9-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v9
-; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112
-; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
-; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v6
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; GFX9-DS128-NEXT: v_bfe_i32 v12, v5, 0, 16
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-DS128-NEXT: v_bfe_i32 v14, v5, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GFX9-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96
-; GFX9-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX9-DS128-NEXT: ds_write_b128 v16, v[12:15] offset:80
; GFX9-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX9-DS128-NEXT: ds_write_b128 v16, v[12:15] offset:64
+; GFX9-DS128-NEXT: v_bfe_i32 v12, v7, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX9-DS128-NEXT: ds_write_b128 v16, v[12:15] offset:112
+; GFX9-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX9-DS128-NEXT: v_bfe_i32 v4, v6, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-DS128-NEXT: ds_write_b128 v16, v[4:7] offset:96
+; GFX9-DS128-NEXT: v_bfe_i32 v4, v2, 0, 16
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX9-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v12, v1, 0, 16
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48
-; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32
-; GFX9-DS128-NEXT: ds_write_b128 v8, v[10:13] offset:16
-; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3]
+; GFX9-DS128-NEXT: ds_write_b128 v16, v[0:3] offset:48
+; GFX9-DS128-NEXT: ds_write_b128 v16, v[4:7] offset:32
+; GFX9-DS128-NEXT: ds_write_b128 v16, v[12:15] offset:16
+; GFX9-DS128-NEXT: ds_write_b128 v16, v[8:11]
; GFX9-DS128-NEXT: s_endpgm
%load = load <16 x i16>, ptr addrspace(3) %in
%ext = sext <16 x i16> %load to <16 x i64>
@@ -8034,107 +8016,103 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v12, s1
; SI-NEXT: s_mov_b32 m0, -1
-; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3
+; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:2 offset1:3
; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1
-; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:6 offset1:7
+; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:6 offset1:7
; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:4 offset1:5
; SI-NEXT: s_waitcnt lgkmcnt(3)
-; SI-NEXT: v_mov_b32_e32 v18, v7
-; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v7
-; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v7
-; SI-NEXT: v_bfe_i32 v18, v18, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v11
+; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v11
+; SI-NEXT: v_bfe_i32 v18, v11, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; SI-NEXT: v_mov_b32_e32 v7, s0
-; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:14 offset1:15
-; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v5
-; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v5
-; SI-NEXT: v_bfe_i32 v18, v5, 0, 16
+; SI-NEXT: v_mov_b32_e32 v11, s0
+; SI-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:14 offset1:15
+; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v9
+; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v9
+; SI-NEXT: v_bfe_i32 v18, v9, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:10 offset1:11
+; SI-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:10 offset1:11
; SI-NEXT: s_waitcnt lgkmcnt(4)
-; SI-NEXT: v_mov_b32_e32 v5, v3
; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v3
; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v3
-; SI-NEXT: v_bfe_i32 v18, v5, 0, 16
+; SI-NEXT: v_bfe_i32 v18, v3, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:6 offset1:7
+; SI-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:6 offset1:7
; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v1
; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v1
; SI-NEXT: v_bfe_i32 v18, v1, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:2 offset1:3
; SI-NEXT: s_waitcnt lgkmcnt(5)
-; SI-NEXT: v_mov_b32_e32 v1, v11
-; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v11
-; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v11
-; SI-NEXT: v_bfe_i32 v18, v1, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v7
+; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v7
+; SI-NEXT: v_bfe_i32 v18, v7, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:30 offset1:31
-; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v9
-; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v9
-; SI-NEXT: v_bfe_i32 v18, v9, 0, 16
+; SI-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:30 offset1:31
+; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v5
+; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v5
+; SI-NEXT: v_bfe_i32 v18, v5, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:26 offset1:27
+; SI-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:26 offset1:27
; SI-NEXT: s_waitcnt lgkmcnt(6)
-; SI-NEXT: v_mov_b32_e32 v1, v15
-; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v15
-; SI-NEXT: v_bfe_i32 v17, v1, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; SI-NEXT: ds_write2_b64 v7, v[17:18], v[15:16] offset0:22 offset1:23
+; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v15
+; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v15
+; SI-NEXT: v_bfe_i32 v18, v15, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; SI-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:22 offset1:23
; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v13
; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v13
; SI-NEXT: v_bfe_i32 v17, v13, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; SI-NEXT: ds_write2_b64 v7, v[17:18], v[15:16] offset0:18 offset1:19
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; SI-NEXT: v_bfe_i32 v5, v6, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; SI-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:18 offset1:19
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
+; SI-NEXT: v_bfe_i32 v9, v10, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; SI-NEXT: v_bfe_i32 v15, v1, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; SI-NEXT: ds_write2_b64 v7, v[5:6], v[15:16] offset0:12 offset1:13
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10
-; SI-NEXT: v_bfe_i32 v3, v4, 0, 16
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8
-; SI-NEXT: v_bfe_i32 v5, v1, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
-; SI-NEXT: ds_write2_b64 v7, v[3:4], v[5:6] offset0:8 offset1:9
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v12
+; SI-NEXT: ds_write2_b64 v11, v[9:10], v[15:16] offset0:12 offset1:13
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v6
+; SI-NEXT: v_bfe_i32 v7, v8, 0, 16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4
+; SI-NEXT: v_bfe_i32 v9, v1, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
+; SI-NEXT: ds_write2_b64 v11, v[7:8], v[9:10] offset0:8 offset1:9
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12
; SI-NEXT: v_bfe_i32 v1, v12, 0, 16
; SI-NEXT: v_bfe_i32 v3, v14, 0, 16
-; SI-NEXT: v_bfe_i32 v5, v8, 0, 16
-; SI-NEXT: v_bfe_i32 v8, v10, 0, 16
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0
-; SI-NEXT: v_bfe_i32 v9, v0, 0, 16
-; SI-NEXT: v_bfe_i32 v10, v2, 0, 16
-; SI-NEXT: v_bfe_i32 v12, v11, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
+; SI-NEXT: v_bfe_i32 v5, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v7, v6, 0, 16
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0
+; SI-NEXT: v_bfe_i32 v8, v0, 0, 16
+; SI-NEXT: v_bfe_i32 v9, v2, 0, 16
+; SI-NEXT: v_bfe_i32 v12, v13, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
-; SI-NEXT: ds_write2_b64 v7, v[10:11], v[12:13] offset0:4 offset1:5
-; SI-NEXT: v_bfe_i32 v11, v6, 0, 16
+; SI-NEXT: ds_write2_b64 v11, v[9:10], v[12:13] offset0:4 offset1:5
+; SI-NEXT: v_bfe_i32 v12, v18, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; SI-NEXT: v_bfe_i32 v13, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v14, v17, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; SI-NEXT: v_bfe_i32 v15, v15, 0, 16
+; SI-NEXT: v_bfe_i32 v16, v16, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
-; SI-NEXT: v_bfe_i32 v16, v14, 0, 16
-; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; SI-NEXT: ds_write2_b64 v7, v[9:10], v[16:17] offset1:1
-; SI-NEXT: v_bfe_i32 v17, v18, 0, 16
+; SI-NEXT: v_bfe_i32 v17, v19, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; SI-NEXT: ds_write2_b64 v7, v[8:9], v[17:18] offset0:28 offset1:29
-; SI-NEXT: ds_write2_b64 v7, v[5:6], v[15:16] offset0:24 offset1:25
-; SI-NEXT: ds_write2_b64 v7, v[3:4], v[13:14] offset0:20 offset1:21
-; SI-NEXT: ds_write2_b64 v7, v[1:2], v[11:12] offset0:16 offset1:17
+; SI-NEXT: ds_write2_b64 v11, v[8:9], v[17:18] offset1:1
+; SI-NEXT: v_bfe_i32 v9, v15, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
+; SI-NEXT: ds_write2_b64 v11, v[7:8], v[9:10] offset0:28 offset1:29
+; SI-NEXT: ds_write2_b64 v11, v[5:6], v[16:17] offset0:24 offset1:25
+; SI-NEXT: ds_write2_b64 v11, v[3:4], v[14:15] offset0:20 offset1:21
+; SI-NEXT: ds_write2_b64 v11, v[1:2], v[12:13] offset0:16 offset1:17
; SI-NEXT: s_endpgm
;
; VI-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64:
@@ -8142,19 +8120,19 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, s1
-; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v7 offset0:6 offset1:7
-; VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v7 offset0:4 offset1:5
+; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1
+; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:6 offset1:7
+; VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v4 offset0:4 offset1:5
; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, s0
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; VI-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v18, v3, 0, 16
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; VI-NO-DS128-NEXT: v_bfe_i32 v16, v3, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; VI-NO-DS128-NEXT: ds_read2_b64 v[3:6], v7 offset0:2 offset1:3
-; VI-NO-DS128-NEXT: ds_read2_b64 v[7:10], v7 offset1:1
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:30 offset1:31
+; VI-NO-DS128-NEXT: ds_read2_b64 v[7:10], v4 offset0:2 offset1:3
+; VI-NO-DS128-NEXT: ds_read2_b64 v[3:6], v4 offset1:1
+; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[18:19] offset0:30 offset1:31
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v18, v2, 0, 16
@@ -8186,87 +8164,86 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v13
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(5)
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v4
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:20 offset1:21
; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v13, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v5
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[14:15] offset0:18 offset1:19
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v12
-; VI-NO-DS128-NEXT: v_bfe_i32 v15, v12, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v17, v16, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v12
+; VI-NO-DS128-NEXT: v_bfe_i32 v17, v12, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[15:16], v[17:18] offset0:16 offset1:17
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v6
-; VI-NO-DS128-NEXT: v_bfe_i32 v15, v15, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v17, v6, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[19:20] offset0:16 offset1:17
+; VI-NO-DS128-NEXT: v_bfe_i32 v17, v0, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v19, v10, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:14 offset1:15
-; VI-NO-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v4
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[5:6], v[15:16] offset0:12 offset1:13
-; VI-NO-DS128-NEXT: v_bfe_i32 v5, v12, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[15:16], v[5:6] offset0:10 offset1:11
-; VI-NO-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v3
-; VI-NO-DS128-NEXT: v_bfe_i32 v17, v3, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v10
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21
+; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[19:20], v[17:18] offset0:14 offset1:15
+; VI-NO-DS128-NEXT: v_bfe_i32 v17, v0, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v9, v9, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v7
-; VI-NO-DS128-NEXT: v_bfe_i32 v19, v19, 0, 16
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[21:22] offset0:8 offset1:9
-; VI-NO-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v4, v7, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16
-; VI-NO-DS128-NEXT: v_bfe_i32 v8, v9, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8
+; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[9:10], v[17:18] offset0:12 offset1:13
+; VI-NO-DS128-NEXT: v_bfe_i32 v9, v0, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v17, v8, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[9:10] offset0:10 offset1:11
+; VI-NO-DS128-NEXT: v_bfe_i32 v17, v0, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v19, v7, 0, 16
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v6
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; VI-NO-DS128-NEXT: v_bfe_i32 v8, v12, 0, 16
+; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[19:20], v[17:18] offset0:8 offset1:9
+; VI-NO-DS128-NEXT: v_bfe_i32 v17, v5, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v5, v6, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; VI-NO-DS128-NEXT: v_bfe_i32 v21, v3, 0, 16
+; VI-NO-DS128-NEXT: v_bfe_i32 v3, v4, 0, 16
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[19:20] offset0:6 offset1:7
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[8:9], v[15:16] offset0:4 offset1:5
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[6:7], v[13:14] offset0:2 offset1:3
-; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[4:5], v[1:2] offset1:1
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[5:6], v[8:9] offset0:6 offset1:7
+; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:4 offset1:5
+; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[3:4], v[13:14] offset0:2 offset1:3
+; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[21:22], v[1:2] offset1:1
; VI-NO-DS128-NEXT: s_endpgm
;
; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64:
; GFX9-NO-DS128: ; %bb.0:
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s1
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v8 offset0:6 offset1:7
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v8 offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v11, s1
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v11 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v11 offset0:4 offset1:5
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v15, s0
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v9, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v7, 0, 16
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v8, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[11:14], v8 offset1:1
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[7:10], v8 offset0:2 offset1:3
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:30 offset1:31
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[7:10], v11 offset1:1
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[11:14], v11 offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:30 offset1:31
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16
@@ -8303,63 +8280,62 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v1, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[2:3] offset0:18 offset1:19
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v0
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v17, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[16:17] offset0:16 offset1:17
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:16 offset1:17
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(8)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v10, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v14
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v14, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[3:4] offset0:14 offset1:15
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v9
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v9, v9, 0, 16
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[9:10], v[3:4] offset0:12 offset1:13
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v12
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[8:9], v[3:4] offset0:10 offset1:11
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v20, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v4, 0, 16
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v13
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v14
-; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, v14
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v13
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v14, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[13:14], v[16:17] offset0:12 offset1:13
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v12, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v11
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v12, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v18, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v19, 0, 16
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[20:21] offset0:8 offset1:9
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[13:14] offset0:10 offset1:11
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v11, 0, 16
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v13, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v11, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v8
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v9
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:8 offset1:9
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v9, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v9, v10, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v20, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v7, 0, 16
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v7, v8, 0, 16
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:6 offset1:7
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[10:11], v[12:13] offset0:4 offset1:5
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[8:9], v[1:2] offset0:2 offset1:3
-; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[5:6] offset1:1
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[9:10], v[12:13] offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[3:4] offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[7:8], v[1:2] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[20:21], v[5:6] offset1:1
; GFX9-NO-DS128-NEXT: s_endpgm
;
; EG-LABEL: local_sextload_v32i16_to_v32i64:
@@ -8710,231 +8686,223 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-DS128-NEXT: s_mov_b32 m0, -1
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DS128-NEXT: v_mov_b32_e32 v4, s1
-; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 offset:48
-; VI-DS128-NEXT: ds_read_b128 v[9:12], v4 offset:32
-; VI-DS128-NEXT: v_mov_b32_e32 v8, s0
-; VI-DS128-NEXT: ds_read_b128 v[17:20], v4 offset:16
-; VI-DS128-NEXT: ds_read_b128 v[4:7], v4
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
-; VI-DS128-NEXT: v_bfe_i32 v13, v2, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16
+; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT: ds_read_b128 v[5:8], v0 offset:48
+; VI-DS128-NEXT: ds_read_b128 v[9:12], v0 offset:32
+; VI-DS128-NEXT: v_mov_b32_e32 v4, s0
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-DS128-NEXT: v_mov_b32_e32 v2, v3
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:224
-; VI-DS128-NEXT: v_bfe_i32 v13, v2, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v15, v3, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v8
+; VI-DS128-NEXT: ds_read_b128 v[17:20], v0 offset:16
+; VI-DS128-NEXT: ds_read_b128 v[0:3], v0
+; VI-DS128-NEXT: ds_write_b128 v4, v[13:16] offset:224
+; VI-DS128-NEXT: v_bfe_i32 v15, v7, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v13, v8, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:240
-; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; VI-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:208
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; VI-DS128-NEXT: ds_write_b128 v4, v[13:16] offset:240
+; VI-DS128-NEXT: v_bfe_i32 v15, v7, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; VI-DS128-NEXT: v_bfe_i32 v13, v5, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v5, v6, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v7, v7, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; VI-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:208
; VI-DS128-NEXT: s_waitcnt lgkmcnt(5)
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v11
-; VI-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:192
-; VI-DS128-NEXT: v_mov_b32_e32 v13, v12
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:160
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v12
-; VI-DS128-NEXT: v_bfe_i32 v0, v13, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:176
-; VI-DS128-NEXT: v_bfe_i32 v0, v9, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v9
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v11
+; VI-DS128-NEXT: v_bfe_i32 v5, v11, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v7, v7, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; VI-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:160
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
+; VI-DS128-NEXT: v_bfe_i32 v5, v12, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v7, v7, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; VI-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:176
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v7, v7, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; VI-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:128
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(6)
+; VI-DS128-NEXT: v_bfe_i32 v5, v1, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-DS128-NEXT: v_bfe_i32 v7, v1, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v10
; VI-DS128-NEXT: v_bfe_i32 v9, v10, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v11, v1, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:144
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(8)
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v19
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v19
+; VI-DS128-NEXT: ds_write_b128 v4, v[9:12] offset:144
; VI-DS128-NEXT: v_bfe_i32 v9, v19, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:128
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(8)
-; VI-DS128-NEXT: v_bfe_i32 v0, v5, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; VI-DS128-NEXT: v_mov_b32_e32 v5, v20
+; VI-DS128-NEXT: v_bfe_i32 v11, v1, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:96
-; VI-DS128-NEXT: v_bfe_i32 v9, v5, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v20
-; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v20
+; VI-DS128-NEXT: ds_write_b128 v4, v[9:12] offset:96
+; VI-DS128-NEXT: v_bfe_i32 v9, v20, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v11, v1, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v17
-; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:112
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; VI-DS128-NEXT: ds_write_b128 v4, v[9:12] offset:112
; VI-DS128-NEXT: v_bfe_i32 v9, v17, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v11, v1, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v18
-; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:64
-; VI-DS128-NEXT: v_bfe_i32 v9, v4, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; VI-DS128-NEXT: ds_write_b128 v4, v[9:12] offset:64
+; VI-DS128-NEXT: v_bfe_i32 v9, v0, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; VI-DS128-NEXT: v_bfe_i32 v11, v0, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; VI-DS128-NEXT: ds_write_b128 v4, v[13:16] offset:192
; VI-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v15, v5, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v11, v4, 0, 16
-; VI-DS128-NEXT: v_mov_b32_e32 v4, v7
+; VI-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:80
-; VI-DS128-NEXT: v_bfe_i32 v13, v4, 0, 16
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; VI-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v4, v6, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16
-; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; VI-DS128-NEXT: ds_write_b128 v4, v[13:16] offset:80
+; VI-DS128-NEXT: v_bfe_i32 v13, v3, 0, 16
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; VI-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v0, v2, 0, 16
+; VI-DS128-NEXT: v_bfe_i32 v2, v3, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7
; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32
-; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:48
-; VI-DS128-NEXT: ds_write_b128 v8, v[9:12]
-; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16
+; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:32
+; VI-DS128-NEXT: ds_write_b128 v4, v[13:16] offset:48
+; VI-DS128-NEXT: ds_write_b128 v4, v[9:12]
+; VI-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:16
; VI-DS128-NEXT: s_endpgm
;
; GFX9-DS128-LABEL: local_sextload_v32i16_to_v32i64:
; GFX9-DS128: ; %bb.0:
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DS128-NEXT: v_mov_b32_e32 v13, s1
-; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v13 offset:48
-; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v13 offset:32
-; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s0
-; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v13
-; GFX9-DS128-NEXT: ds_read_b128 v[18:21], v13 offset:16
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
-; GFX9-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX9-DS128-NEXT: v_bfe_i32 v16, v6, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GFX9-DS128-NEXT: v_mov_b32_e32 v6, v7
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[14:17] offset:224
-; GFX9-DS128-NEXT: v_bfe_i32 v13, v6, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v15, v7, 0, 16
+; GFX9-DS128-NEXT: v_mov_b32_e32 v17, s1
+; GFX9-DS128-NEXT: ds_read_b128 v[9:12], v17 offset:48
+; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v17 offset:32
+; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v11
+; GFX9-DS128-NEXT: v_bfe_i32 v13, v11, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:240
-; GFX9-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5
-; GFX9-DS128-NEXT: v_bfe_i32 v13, v4, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:208
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(5)
-; GFX9-DS128-NEXT: v_bfe_i32 v4, v2, 0, 16
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v12
+; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v17
+; GFX9-DS128-NEXT: ds_read_b128 v[17:20], v17 offset:16
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:224
+; GFX9-DS128-NEXT: v_bfe_i32 v13, v11, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v11, v12, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GFX9-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:192
-; GFX9-DS128-NEXT: v_mov_b32_e32 v13, v3
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:160
-; GFX9-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:176
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX9-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-DS128-NEXT: v_bfe_i32 v13, v1, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:240
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v9
+; GFX9-DS128-NEXT: v_bfe_i32 v13, v11, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v11, v9, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:192
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX9-DS128-NEXT: v_bfe_i32 v9, v10, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:208
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(6)
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v20
+; GFX9-DS128-NEXT: v_bfe_i32 v9, v2, 0, 16
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-DS128-NEXT: v_bfe_i32 v11, v2, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:160
+; GFX9-DS128-NEXT: v_bfe_i32 v9, v3, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v11, v2, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:176
+; GFX9-DS128-NEXT: v_bfe_i32 v9, v0, 0, 16
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-DS128-NEXT: v_bfe_i32 v11, v0, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(7)
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:128
+; GFX9-DS128-NEXT: v_bfe_i32 v11, v0, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:144
-; GFX9-DS128-NEXT: v_bfe_i32 v13, v20, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:128
-; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
-; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v21
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:96
-; GFX9-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v21
-; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v18
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:112
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:144
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(8)
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v19
+; GFX9-DS128-NEXT: v_bfe_i32 v0, v19, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:96
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v20
+; GFX9-DS128-NEXT: v_bfe_i32 v0, v20, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:112
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v17
+; GFX9-DS128-NEXT: v_bfe_i32 v0, v17, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:64
+; GFX9-DS128-NEXT: v_bfe_i32 v0, v4, 0, 16
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v18
; GFX9-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v19
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:64
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8
-; GFX9-DS128-NEXT: v_bfe_i32 v13, v19, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16
-; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v11
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:80
-; GFX9-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11
-; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; GFX9-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v2, v9, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:80
+; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v4, v6, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v9, v5, 0, 16
+; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[17:20] offset:32
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:48
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[6:9]
-; GFX9-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:16
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:48
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3]
+; GFX9-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:16
; GFX9-DS128-NEXT: s_endpgm
%load = load <32 x i16>, ptr addrspace(3) %in
%ext = sext <32 x i16> %load to <32 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
new file mode 100644
index 0000000000000..4999c5de68087
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
@@ -0,0 +1,134 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# FIXME For now this needs dce after peephole-opt. Integrate elimination into pass?
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=peephole-opt,dce,si-peephole-sdwa -o - %s | FileCheck %s
+
+---
+name: sdwa_reg_sequence
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+
+ ; CHECK-LABEL: name: sdwa_reg_sequence
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 10, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 20, 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_U32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_ADD_U32_e64_1]], killed [[S_MOV_B32_]], implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_ADD_U32_e64_]], [[V_AND_B32_e64_]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY1]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = V_ADD_U32_e64 %0, 10, 0, implicit $exec
+ %2:vgpr_32 = V_ADD_U32_e64 %0, 20, 0, implicit $exec
+ %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
+ %5:sreg_32 = S_MOV_B32 255
+ %6:vgpr_32 = V_AND_B32_e64 killed %2, killed %5, implicit $exec
+ %7:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %3, %subreg.sub1
+ %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7.sub0, 0, implicit $exec
+ %10:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed %9, 0, implicit $exec
+ %12:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %10, %subreg.sub1
+ %13:sreg_64 = IMPLICIT_DEF
+ %14:vreg_64 = COPY %13
+ GLOBAL_STORE_DWORDX2 killed %14, killed %12, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: sdwa_reg_sequence_composed_subregs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ ; CHECK-LABEL: name: sdwa_reg_sequence_composed_subregs
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]].sub0, 10, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]].sub1, 20, 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_U32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_ADD_U32_e64_1]], killed [[S_MOV_B32_]], implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], [[V_AND_B32_e64_]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vreg_64 = COPY $vgpr1_vgpr2
+ %1:vgpr_32 = V_ADD_U32_e64 %0.sub0, 10, 0, implicit $exec
+ %2:vgpr_32 = V_ADD_U32_e64 %0.sub1, 20, 0, implicit $exec
+ %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
+ %5:vreg_64 = REG_SEQUENCE %0.sub0, %subreg.sub0, %4.sub1, %subreg.sub1
+ %6:sreg_32 = S_MOV_B32 255
+ %7:vgpr_32 = V_AND_B32_e64 killed %2, killed %6, implicit $exec
+ %8:vreg_64 = REG_SEQUENCE %7, %subreg.sub0, %3, %subreg.sub1
+ %9:vgpr_32, %10:sreg_64_xexec = V_ADD_CO_U32_e64 %5.sub1, %8.sub0, 0, implicit $exec
+ %11:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed %10, 0, implicit $exec
+ %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1
+ %15:vreg_64 = COPY %13
+ GLOBAL_STORE_DWORDX2 killed %15, killed %13, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ S_ENDPGM 0
+...
+
+
+---
+name: sdwa_reg_sequence_multiple_uses
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+
+ ; CHECK-LABEL: name: sdwa_reg_sequence_multiple_uses
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 10, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 20, 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_U32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_ADD_U32_e64_1]], killed [[S_MOV_B32_]], implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_ADD_U32_e64_]], [[V_AND_B32_e64_]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_AND_B32_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY1]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = V_ADD_U32_e64 %0, 10, 0, implicit $exec
+ %2:vgpr_32 = V_ADD_U32_e64 %0, 20, 0, implicit $exec
+ %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
+ %5:sreg_32 = S_MOV_B32 255
+ %6:vgpr_32 = V_AND_B32_e64 killed %2, killed %5, implicit $exec
+ %7:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %3, %subreg.sub1
+ %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7.sub0, 0, implicit $exec
+ %10:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed %9, 0, implicit $exec
+ %12:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %10, %subreg.sub1
+ %13:sreg_64 = IMPLICIT_DEF
+ %14:vreg_64 = COPY %13
+ %15:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORDX2 killed %14, killed %12, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ S_ENDPGM 0
+...
More information about the llvm-commits
mailing list