[llvm-branch-commits] [llvm] WIP: AMDGPU: Implement getRegSequenceLikeInputs for v_pk_mov_b32 (PR #125657)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Feb 6 21:38:20 PST 2025
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/125657
>From 259f74ebb20b8ddf913ad36a1241902d753f1393 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 21 Jan 2025 16:11:40 +0700
Subject: [PATCH] WIP: AMDGPU: Implement getRegSequenceLikeInputs for
v_pk_mov_b32
In principle we need this analysis to avoid regressions when
using v_pk_mov_b32 when shuffling to physical register inputs. However,
as it stands this only introduces regressions by decomposing every
useful case where we benefit from the instruction.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 34 +
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3 +
llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 2 +-
.../AMDGPU/reg-sequence-like-v-pk-mov-b32.mir | 28 +-
.../AMDGPU/shufflevector.v4f32.v2f32.ll | 210 +++--
.../AMDGPU/shufflevector.v4f32.v3f32.ll | 272 +++---
.../AMDGPU/shufflevector.v4f32.v4f32.ll | 809 ++++++++++--------
.../AMDGPU/shufflevector.v4i32.v2i32.ll | 210 +++--
.../AMDGPU/shufflevector.v4i32.v3i32.ll | 272 +++---
.../AMDGPU/shufflevector.v4i32.v4i32.ll | 809 ++++++++++--------
.../CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll | 210 +++--
.../CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll | 272 +++---
.../CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll | 809 ++++++++++--------
13 files changed, 2242 insertions(+), 1698 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 35667801c809d5c..3cae838321885d6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9725,6 +9725,40 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
return nullptr;
}
+bool SIInstrInfo::getRegSequenceLikeInputs(
+ const MachineInstr &MI, unsigned DefIdx,
+ SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const {
+ assert(MI.getOpcode() == AMDGPU::V_PK_MOV_B32 &&
+ "v_pk_mov_b32 is the only reg-sequence like instruction");
+ assert(DefIdx == 0);
+
+ unsigned Src0Mods = MI.getOperand(1).getImm();
+ const MachineOperand &Src0 = MI.getOperand(2);
+ unsigned Src1Mods = MI.getOperand(3).getImm();
+ const MachineOperand &Src1 = MI.getOperand(4);
+
+ unsigned SubReg0 =
+ Src0Mods & SISrcMods::OP_SEL_0 ? AMDGPU::sub1 : AMDGPU::sub0;
+ unsigned SubReg1 =
+ Src1Mods & SISrcMods::OP_SEL_0 ? AMDGPU::sub1 : AMDGPU::sub0;
+
+ if (!Src0.isUndef()) {
+ // The result's sub0 is read from src0 (half chosen by src0's OP_SEL_0 bit).
+ SubReg0 = RI.composeSubRegIndices(Src0.getSubReg(), SubReg0);
+ InputRegs.push_back(
+ RegSubRegPairAndIdx(Src0.getReg(), SubReg0, AMDGPU::sub0));
+ }
+
+ if (!Src1.isUndef()) {
+ // The result's sub1 is read from src1 (half chosen by src1's OP_SEL_0 bit).
+ SubReg1 = RI.composeSubRegIndices(Src1.getSubReg(), SubReg1);
+ InputRegs.push_back(
+ RegSubRegPairAndIdx(Src1.getReg(), SubReg1, AMDGPU::sub1));
+ }
+
+ return true;
+}
+
unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr &MI,
unsigned *PredCost) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 933935a86f9f98f..425ff77e8cdc3fb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1437,6 +1437,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
int FrameIndex,
LiveIntervals *LIS = nullptr,
VirtRegMap *VRM = nullptr) const override;
+ bool getRegSequenceLikeInputs(
+ const MachineInstr &MI, unsigned DefIdx,
+ SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const override;
unsigned getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr &MI,
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 1afd68767cd3ba6..a6f8035e93b182c 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1107,7 +1107,7 @@ let isCommutable = 1, isReMaterializable = 1 in {
defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
} // End SubtargetPredicate = HasPackedFP32Ops
- let SubtargetPredicate = HasPkMovB32 in
+ let SubtargetPredicate = HasPkMovB32, isRegSequence = 1 in
defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
} // End isCommutable = 1, isReMaterializable = 1
diff --git a/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir
index 90291221e8e178a..602fc29bf1db36f 100644
--- a/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir
@@ -15,8 +15,8 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; CHECK-NEXT: $vgpr4 = COPY [[COPY2]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY3]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5
@@ -49,8 +49,8 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; CHECK-NEXT: $vgpr4 = COPY [[COPY4]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY5]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5
@@ -118,7 +118,7 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; CHECK-NEXT: $vgpr4 = COPY [[COPY4]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY5]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5
@@ -154,7 +154,7 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
; CHECK-NEXT: $vgpr4 = COPY [[COPY4]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY5]]
@@ -191,7 +191,7 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, undef [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
; CHECK-NEXT: $vgpr4 = COPY [[COPY4]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY5]]
@@ -229,7 +229,7 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, undef [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; CHECK-NEXT: $vgpr4 = COPY [[COPY4]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY5]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5
@@ -304,8 +304,8 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]].sub0_sub1, 8, [[REG_SEQUENCE1]].sub0_sub1, 0, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; CHECK-NEXT: $vgpr4 = COPY [[COPY6]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY7]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5
@@ -345,8 +345,8 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, [[REG_SEQUENCE]].sub0_sub1, 12, [[REG_SEQUENCE1]].sub0_sub1, 0, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
; CHECK-NEXT: $vgpr4 = COPY [[COPY6]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY7]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5
@@ -388,8 +388,8 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]].sub2_sub3, 12, [[REG_SEQUENCE1]].sub0_sub1, 0, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3
; CHECK-NEXT: $vgpr4 = COPY [[COPY8]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY9]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll
index 9e3c044a76295f0..fafdac7ffac62f2 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll
@@ -176,7 +176,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -191,8 +192,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -565,15 +566,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -581,15 +583,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -672,26 +675,31 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_3_2:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -865,7 +873,8 @@ define void @v_shuffle_v4f32_v2f32__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -878,7 +887,8 @@ define void @v_shuffle_v4f32_v2f32__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -962,7 +972,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -978,7 +989,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1126,13 +1138,14 @@ define void @v_shuffle_v4f32_v2f32__3_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1142,13 +1155,14 @@ define void @v_shuffle_v4f32_v2f32__3_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1289,15 +1303,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1305,15 +1320,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1707,7 +1723,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1723,7 +1740,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1758,13 +1776,14 @@ define void @v_shuffle_v4f32_v2f32__3_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1774,13 +1793,14 @@ define void @v_shuffle_v4f32_v2f32__3_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -2132,7 +2152,8 @@ define void @v_shuffle_v4f32_v2f32__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2145,7 +2166,8 @@ define void @v_shuffle_v4f32_v2f32__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -2224,15 +2246,16 @@ define void @v_shuffle_v4f32_v2f32__3_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2240,15 +2263,16 @@ define void @v_shuffle_v4f32_v2f32__3_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -2492,13 +2516,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2508,13 +2534,15 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -2798,15 +2826,16 @@ define void @v_shuffle_v4f32_v2f32__3_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2814,15 +2843,16 @@ define void @v_shuffle_v4f32_v2f32__3_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -2908,7 +2938,8 @@ define void @v_shuffle_v4f32_v2f32__3_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2921,7 +2952,8 @@ define void @v_shuffle_v4f32_v2f32__3_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll
index 31c458f5338cb99..c797ef338dee96d 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll
@@ -1280,7 +1280,8 @@ define void @v_shuffle_v4f32_v3f32__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
@@ -1295,7 +1296,8 @@ define void @v_shuffle_v4f32_v3f32__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
@@ -1433,7 +1435,8 @@ define void @v_shuffle_v4f32_v3f32__4_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
@@ -1450,8 +1453,8 @@ define void @v_shuffle_v4f32_v3f32__4_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
@@ -1961,14 +1964,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1978,15 +1982,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v5, 0
+; GFX940-NEXT: v_mov_b32_e32 v7, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:4]
+; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v2
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2138,16 +2142,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_mov_b32_e32 v2, v7
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2155,17 +2160,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:2]
+; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v5, 0
+; GFX940-NEXT: v_mov_b32_e32 v9, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:4]
+; GFX940-NEXT: ; def v[6:8]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v0, v8
+; GFX940-NEXT: v_mov_b32_e32 v1, v8
+; GFX940-NEXT: v_mov_b32_e32 v2, v7
+; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3272,9 +3277,9 @@ define void @v_shuffle_v4f32_v3f32__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -3287,8 +3292,8 @@ define void @v_shuffle_v4f32_v3f32__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -3416,12 +3421,12 @@ define void @v_shuffle_v4f32_v3f32__4_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -3434,12 +3439,12 @@ define void @v_shuffle_v4f32_v3f32__4_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v7, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -3986,17 +3991,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4004,17 +4009,18 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:2]
+; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v7, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
+; GFX940-NEXT: v_mov_b32_e32 v5, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:6]
+; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v6
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v2
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4106,17 +4112,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_mov_b32_e32 v2, v7
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4124,18 +4130,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:2]
+; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v7, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
+; GFX940-NEXT: v_mov_b32_e32 v5, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:6]
+; GFX940-NEXT: ; def v[6:8]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v6
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v0, v8
+; GFX940-NEXT: v_mov_b32_e32 v1, v8
+; GFX940-NEXT: v_mov_b32_e32 v2, v7
+; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4312,7 +4317,8 @@ define void @v_shuffle_v4f32_v3f32__4_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
@@ -4327,7 +4333,8 @@ define void @v_shuffle_v4f32_v3f32__4_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
@@ -4851,16 +4858,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4868,17 +4876,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_3(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:2]
+; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v5, 0
+; GFX940-NEXT: v_mov_b32_e32 v7, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:4]
+; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4964,28 +4972,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_4_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:4]
+; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v5, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -6083,9 +6094,9 @@ define void @v_shuffle_v4f32_v3f32__1_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6102,8 +6113,8 @@ define void @v_shuffle_v4f32_v3f32__1_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -6241,9 +6252,9 @@ define void @v_shuffle_v4f32_v3f32__4_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6256,8 +6267,8 @@ define void @v_shuffle_v4f32_v3f32__4_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -6716,16 +6727,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v6
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6733,17 +6745,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:2]
+; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v5, 0
+; GFX940-NEXT: v_mov_b32_e32 v7, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:4]
+; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_mov_b32_e32 v3, v6
+; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -6881,10 +6893,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6897,8 +6910,9 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v5, 0
; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll
index e3427cd35c683af..40d6ad90ab34b2c 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll
@@ -343,12 +343,13 @@ define void @v_shuffle_v4f32_v4f32__7_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -363,8 +364,9 @@ define void @v_shuffle_v4f32_v4f32__7_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -454,7 +456,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -469,8 +472,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -557,8 +560,9 @@ define void @v_shuffle_v4f32_v4f32__7_4_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -569,8 +573,9 @@ define void @v_shuffle_v4f32_v4f32__7_4_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -644,7 +649,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -656,7 +662,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -1162,10 +1169,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1180,10 +1188,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v5
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1278,16 +1287,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v6, v7
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1295,17 +1305,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v6, v7
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1392,26 +1402,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_7_4:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1493,26 +1508,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_7_6:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1689,7 +1709,8 @@ define void @v_shuffle_v4f32_v4f32__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -1703,7 +1724,8 @@ define void @v_shuffle_v4f32_v4f32__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -1787,7 +1809,8 @@ define void @v_shuffle_v4f32_v4f32__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -1801,7 +1824,8 @@ define void @v_shuffle_v4f32_v4f32__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -1890,7 +1914,8 @@ define void @v_shuffle_v4f32_v4f32__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -1908,7 +1933,8 @@ define void @v_shuffle_v4f32_v4f32__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -2012,7 +2038,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -2030,7 +2057,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -2186,16 +2214,17 @@ define void @v_shuffle_v4f32_v4f32__7_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2203,16 +2232,17 @@ define void @v_shuffle_v4f32_v4f32__7_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
-; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v6, v0
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -2309,11 +2339,12 @@ define void @v_shuffle_v4f32_v4f32__7_4_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2323,15 +2354,15 @@ define void @v_shuffle_v4f32_v4f32__7_4_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v6, v0
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -2429,7 +2460,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -2447,7 +2479,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -2610,10 +2643,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2628,10 +2662,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v5
+; GFX940-NEXT: v_mov_b32_e32 v4, v1
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -2724,16 +2759,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2741,16 +2777,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -2846,11 +2883,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2860,15 +2897,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -3481,7 +3517,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v1
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -3499,7 +3536,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v1
; GFX940-NEXT: v_mov_b32_e32 v5, v1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -3535,16 +3573,17 @@ define void @v_shuffle_v4f32_v4f32__7_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3552,16 +3591,17 @@ define void @v_shuffle_v4f32_v4f32__7_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
-; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v1
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v7, v1
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -3657,11 +3697,12 @@ define void @v_shuffle_v4f32_v4f32__7_4_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3671,15 +3712,15 @@ define void @v_shuffle_v4f32_v4f32__7_4_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v1
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v7, v1
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -3777,7 +3818,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: v_mov_b32_e32 v4, v1
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -3795,7 +3837,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: v_mov_b32_e32 v4, v1
; GFX940-NEXT: v_mov_b32_e32 v5, v1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -4401,7 +4444,8 @@ define void @v_shuffle_v4f32_v4f32__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -4414,7 +4458,8 @@ define void @v_shuffle_v4f32_v4f32__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -4495,7 +4540,8 @@ define void @v_shuffle_v4f32_v4f32__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -4508,7 +4554,8 @@ define void @v_shuffle_v4f32_v4f32__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -4592,7 +4639,8 @@ define void @v_shuffle_v4f32_v4f32__5_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -4608,8 +4656,8 @@ define void @v_shuffle_v4f32_v4f32__5_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -4707,7 +4755,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -4723,8 +4772,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -4814,15 +4863,17 @@ define void @v_shuffle_v4f32_v4f32__7_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4830,15 +4881,17 @@ define void @v_shuffle_v4f32_v4f32__7_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: v_mov_b32_e32 v6, v2
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -4989,7 +5042,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -5006,7 +5060,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -5103,7 +5158,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -5120,7 +5176,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -5331,16 +5388,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5348,16 +5406,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -5390,16 +5449,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5407,16 +5467,17 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -5507,16 +5568,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v7
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5524,17 +5585,16 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v10, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v6, v7
+; GFX940-NEXT: v_mov_b32_e32 v8, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v2
+; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -6133,15 +6193,17 @@ define void @v_shuffle_v4f32_v4f32__7_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6149,15 +6211,17 @@ define void @v_shuffle_v4f32_v4f32__7_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -6250,7 +6314,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6266,8 +6331,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -6307,7 +6372,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6324,7 +6390,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -6421,7 +6488,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6438,7 +6506,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -7126,7 +7195,8 @@ define void @v_shuffle_v4f32_v4f32__5_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -7140,7 +7210,8 @@ define void @v_shuffle_v4f32_v4f32__5_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -7226,7 +7297,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -7240,7 +7312,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -7331,7 +7404,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_mov_b32_e32 v6, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
@@ -7349,7 +7423,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: v_mov_b32_e32 v6, v2
; GFX940-NEXT: v_mov_b32_e32 v7, v2
; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
@@ -7448,7 +7523,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
@@ -7465,8 +7541,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
@@ -7610,28 +7686,31 @@ define void @v_shuffle_v4f32_v4f32__7_6_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_6_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_6_4_4:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v6, v0
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -7825,11 +7904,11 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7839,15 +7918,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -7944,9 +8022,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v0, v7
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -7961,10 +8040,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -7997,8 +8076,9 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -8010,8 +8090,9 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v4, v1
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -8622,10 +8703,12 @@ define void @v_shuffle_v4f32_v4f32__7_0_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -8635,14 +8718,16 @@ define void @v_shuffle_v4f32_v4f32__7_0_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -8737,7 +8822,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v5
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
@@ -8754,8 +8840,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v2, v5
; GFX940-NEXT: v_mov_b32_e32 v3, v5
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
@@ -8852,7 +8938,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v1
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -8866,7 +8953,8 @@ define void @v_shuffle_v4f32_v4f32__7_4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v1
; GFX940-NEXT: v_mov_b32_e32 v5, v1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -8897,28 +8985,31 @@ define void @v_shuffle_v4f32_v4f32__7_6_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_6_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_6_5_5:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v1
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v7, v1
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -9493,7 +9584,8 @@ define void @v_shuffle_v4f32_v4f32__1_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9510,7 +9602,8 @@ define void @v_shuffle_v4f32_v4f32__1_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: v_mov_b32_e32 v5, v4
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -9608,7 +9701,8 @@ define void @v_shuffle_v4f32_v4f32__3_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v6
; GFX90A-NEXT: v_mov_b32_e32 v7, v6
; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9624,8 +9718,8 @@ define void @v_shuffle_v4f32_v4f32__3_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v5, v6
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
; GFX940-NEXT: v_mov_b32_e32 v7, v6
; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -9705,7 +9799,8 @@ define void @v_shuffle_v4f32_v4f32__5_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9718,7 +9813,8 @@ define void @v_shuffle_v4f32_v4f32__5_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -9801,7 +9897,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9814,7 +9911,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -9900,7 +9998,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9917,7 +10016,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v4
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -10016,7 +10116,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v6
; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -10032,8 +10133,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
; GFX940-NEXT: v_mov_b32_e32 v7, v6
; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -10121,26 +10222,31 @@ define void @v_shuffle_v4f32_v4f32__7_4_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_4_6_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_4_6_6:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: v_mov_b32_e32 v6, v2
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -10380,14 +10486,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v5
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -10397,15 +10504,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v10, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v8, v1
+; GFX940-NEXT: v_mov_b32_e32 v6, v5
+; GFX940-NEXT: v_mov_b32_e32 v7, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v4
+; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -10503,9 +10610,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v0, v7
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v6
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10520,10 +10628,10 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: v_mov_b32_e32 v3, v6
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10606,26 +10714,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_5_6:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -11133,7 +11246,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_7_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v5
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -11150,7 +11264,8 @@ define void @v_shuffle_v4f32_v4f32__7_0_7_7(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v5
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -11249,7 +11364,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_7_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_mov_b32_e32 v6, v7
; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -11265,8 +11381,8 @@ define void @v_shuffle_v4f32_v4f32__7_2_7_7(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
; GFX940-NEXT: v_mov_b32_e32 v6, v7
; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -11355,26 +11471,31 @@ define void @v_shuffle_v4f32_v4f32__7_4_7_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_4_7_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_4_7_7:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -11452,7 +11573,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_7_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -11465,7 +11587,8 @@ define void @v_shuffle_v4f32_v4f32__7_6_7_7(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll
index 94c61f5ad0e866c..86e8e2ed267dc62 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll
@@ -176,7 +176,8 @@ define void @v_shuffle_v4i32_v2i32__3_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -191,8 +192,8 @@ define void @v_shuffle_v4i32_v2i32__3_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -565,15 +566,16 @@ define void @v_shuffle_v4i32_v2i32__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -581,15 +583,16 @@ define void @v_shuffle_v4i32_v2i32__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -672,26 +675,31 @@ define void @v_shuffle_v4i32_v2i32__3_3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_3_2:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -868,7 +876,8 @@ define void @v_shuffle_v4i32_v2i32__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -881,7 +890,8 @@ define void @v_shuffle_v4i32_v2i32__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -968,7 +978,8 @@ define void @v_shuffle_v4i32_v2i32__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -984,7 +995,8 @@ define void @v_shuffle_v4i32_v2i32__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1132,13 +1144,14 @@ define void @v_shuffle_v4i32_v2i32__3_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1148,13 +1161,14 @@ define void @v_shuffle_v4i32_v2i32__3_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1295,15 +1309,16 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1311,15 +1326,16 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1713,7 +1729,8 @@ define void @v_shuffle_v4i32_v2i32__3_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1729,7 +1746,8 @@ define void @v_shuffle_v4i32_v2i32__3_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1764,13 +1782,14 @@ define void @v_shuffle_v4i32_v2i32__3_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1780,13 +1799,14 @@ define void @v_shuffle_v4i32_v2i32__3_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -2138,7 +2158,8 @@ define void @v_shuffle_v4i32_v2i32__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2151,7 +2172,8 @@ define void @v_shuffle_v4i32_v2i32__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -2230,15 +2252,16 @@ define void @v_shuffle_v4i32_v2i32__3_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2246,15 +2269,16 @@ define void @v_shuffle_v4i32_v2i32__3_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -2498,13 +2522,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2514,13 +2540,15 @@ define void @v_shuffle_v4i32_v2i32__3_3_1_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -2804,15 +2832,16 @@ define void @v_shuffle_v4i32_v2i32__3_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2820,15 +2849,16 @@ define void @v_shuffle_v4i32_v2i32__3_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -2914,7 +2944,8 @@ define void @v_shuffle_v4i32_v2i32__3_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2927,7 +2958,8 @@ define void @v_shuffle_v4i32_v2i32__3_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll
index 1b003a7c5d9bc2a..770c1c19087b4ef 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll
@@ -1280,7 +1280,8 @@ define void @v_shuffle_v4i32_v3i32__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
@@ -1295,7 +1296,8 @@ define void @v_shuffle_v4i32_v3i32__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
@@ -1433,7 +1435,8 @@ define void @v_shuffle_v4i32_v3i32__4_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
@@ -1450,8 +1453,8 @@ define void @v_shuffle_v4i32_v3i32__4_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
@@ -1961,14 +1964,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1978,15 +1982,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v5, 0
+; GFX940-NEXT: v_mov_b32_e32 v7, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:4]
+; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v2
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2138,16 +2142,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_mov_b32_e32 v2, v7
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2155,17 +2160,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:2]
+; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v5, 0
+; GFX940-NEXT: v_mov_b32_e32 v9, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:4]
+; GFX940-NEXT: ; def v[6:8]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v0, v8
+; GFX940-NEXT: v_mov_b32_e32 v1, v8
+; GFX940-NEXT: v_mov_b32_e32 v2, v7
+; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3272,9 +3277,9 @@ define void @v_shuffle_v4i32_v3i32__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -3287,8 +3292,8 @@ define void @v_shuffle_v4i32_v3i32__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -3416,12 +3421,12 @@ define void @v_shuffle_v4i32_v3i32__4_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -3434,12 +3439,12 @@ define void @v_shuffle_v4i32_v3i32__4_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v7, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -3986,17 +3991,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4004,17 +4009,18 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:2]
+; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v7, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
+; GFX940-NEXT: v_mov_b32_e32 v5, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:6]
+; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v6
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v2
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4106,17 +4112,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_mov_b32_e32 v2, v7
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4124,18 +4130,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:2]
+; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v7, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
+; GFX940-NEXT: v_mov_b32_e32 v5, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:6]
+; GFX940-NEXT: ; def v[6:8]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v6
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v0, v8
+; GFX940-NEXT: v_mov_b32_e32 v1, v8
+; GFX940-NEXT: v_mov_b32_e32 v2, v7
+; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4312,7 +4317,8 @@ define void @v_shuffle_v4i32_v3i32__4_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
@@ -4327,7 +4333,8 @@ define void @v_shuffle_v4i32_v3i32__4_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
@@ -4851,16 +4858,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4868,17 +4876,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_3(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:2]
+; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v5, 0
+; GFX940-NEXT: v_mov_b32_e32 v7, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:4]
+; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4964,28 +4972,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_4_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:4]
+; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v5, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -6083,9 +6094,9 @@ define void @v_shuffle_v4i32_v3i32__1_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6102,8 +6113,8 @@ define void @v_shuffle_v4i32_v3i32__1_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -6241,9 +6252,9 @@ define void @v_shuffle_v4i32_v3i32__4_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6256,8 +6267,8 @@ define void @v_shuffle_v4i32_v3i32__4_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -6716,16 +6727,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v6
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6733,17 +6745,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:2]
+; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v5, 0
+; GFX940-NEXT: v_mov_b32_e32 v7, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:4]
+; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_mov_b32_e32 v3, v6
+; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -6881,10 +6893,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6897,8 +6910,9 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v5, 0
; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll
index 47ad1c4bedb8b1d..3d1bb1da92e09b9 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll
@@ -343,12 +343,13 @@ define void @v_shuffle_v4i32_v4i32__7_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -363,8 +364,9 @@ define void @v_shuffle_v4i32_v4i32__7_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -454,7 +456,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -469,8 +472,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -557,8 +560,9 @@ define void @v_shuffle_v4i32_v4i32__7_4_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -569,8 +573,9 @@ define void @v_shuffle_v4i32_v4i32__7_4_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -644,7 +649,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -656,7 +662,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -1162,10 +1169,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1180,10 +1188,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v5
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1278,16 +1287,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v6, v7
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1295,17 +1305,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v6, v7
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1392,26 +1402,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_7_4:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1493,26 +1508,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_7_6:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1689,7 +1709,8 @@ define void @v_shuffle_v4i32_v4i32__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -1703,7 +1724,8 @@ define void @v_shuffle_v4i32_v4i32__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -1787,7 +1809,8 @@ define void @v_shuffle_v4i32_v4i32__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -1801,7 +1824,8 @@ define void @v_shuffle_v4i32_v4i32__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -1890,7 +1914,8 @@ define void @v_shuffle_v4i32_v4i32__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -1908,7 +1933,8 @@ define void @v_shuffle_v4i32_v4i32__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -2012,7 +2038,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -2030,7 +2057,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -2186,16 +2214,17 @@ define void @v_shuffle_v4i32_v4i32__7_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2203,16 +2232,17 @@ define void @v_shuffle_v4i32_v4i32__7_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
-; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v6, v0
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -2309,11 +2339,12 @@ define void @v_shuffle_v4i32_v4i32__7_4_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2323,15 +2354,15 @@ define void @v_shuffle_v4i32_v4i32__7_4_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v6, v0
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -2429,7 +2460,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -2447,7 +2479,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -2610,10 +2643,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2628,10 +2662,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v5
+; GFX940-NEXT: v_mov_b32_e32 v4, v1
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -2724,16 +2759,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2741,16 +2777,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -2846,11 +2883,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2860,15 +2897,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -3481,7 +3517,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v1
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -3499,7 +3536,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v1
; GFX940-NEXT: v_mov_b32_e32 v5, v1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -3535,16 +3573,17 @@ define void @v_shuffle_v4i32_v4i32__7_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3552,16 +3591,17 @@ define void @v_shuffle_v4i32_v4i32__7_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
-; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v1
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v7, v1
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -3657,11 +3697,12 @@ define void @v_shuffle_v4i32_v4i32__7_4_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3671,15 +3712,15 @@ define void @v_shuffle_v4i32_v4i32__7_4_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v1
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v7, v1
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -3777,7 +3818,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: v_mov_b32_e32 v4, v1
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -3795,7 +3837,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: v_mov_b32_e32 v4, v1
; GFX940-NEXT: v_mov_b32_e32 v5, v1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -4401,7 +4444,8 @@ define void @v_shuffle_v4i32_v4i32__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -4414,7 +4458,8 @@ define void @v_shuffle_v4i32_v4i32__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -4495,7 +4540,8 @@ define void @v_shuffle_v4i32_v4i32__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -4508,7 +4554,8 @@ define void @v_shuffle_v4i32_v4i32__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -4592,7 +4639,8 @@ define void @v_shuffle_v4i32_v4i32__5_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -4608,8 +4656,8 @@ define void @v_shuffle_v4i32_v4i32__5_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -4707,7 +4755,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -4723,8 +4772,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -4814,15 +4863,17 @@ define void @v_shuffle_v4i32_v4i32__7_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4830,15 +4881,17 @@ define void @v_shuffle_v4i32_v4i32__7_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: v_mov_b32_e32 v6, v2
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -4989,7 +5042,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -5006,7 +5060,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -5103,7 +5158,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -5120,7 +5176,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -5331,16 +5388,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5348,16 +5406,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -5390,16 +5449,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5407,16 +5467,17 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -5507,16 +5568,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v7
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5524,17 +5585,16 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v10, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v6, v7
+; GFX940-NEXT: v_mov_b32_e32 v8, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v2
+; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -6133,15 +6193,17 @@ define void @v_shuffle_v4i32_v4i32__7_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6149,15 +6211,17 @@ define void @v_shuffle_v4i32_v4i32__7_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -6250,7 +6314,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6266,8 +6331,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -6307,7 +6372,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6324,7 +6390,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -6421,7 +6488,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6438,7 +6506,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -7126,7 +7195,8 @@ define void @v_shuffle_v4i32_v4i32__5_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -7140,7 +7210,8 @@ define void @v_shuffle_v4i32_v4i32__5_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -7226,7 +7297,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -7240,7 +7312,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -7331,7 +7404,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_mov_b32_e32 v6, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
@@ -7349,7 +7423,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: v_mov_b32_e32 v6, v2
; GFX940-NEXT: v_mov_b32_e32 v7, v2
; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
@@ -7448,7 +7523,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
@@ -7465,8 +7541,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
@@ -7610,28 +7686,31 @@ define void @v_shuffle_v4i32_v4i32__7_6_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_6_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_6_4_4:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v6, v0
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -7825,11 +7904,11 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7839,15 +7918,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -7944,9 +8022,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v0, v7
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -7961,10 +8040,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -7997,8 +8076,9 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -8010,8 +8090,9 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v4, v1
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -8622,10 +8703,12 @@ define void @v_shuffle_v4i32_v4i32__7_0_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -8635,14 +8718,16 @@ define void @v_shuffle_v4i32_v4i32__7_0_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -8737,7 +8822,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v5
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
@@ -8754,8 +8840,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v2, v5
; GFX940-NEXT: v_mov_b32_e32 v3, v5
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
@@ -8852,7 +8938,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v1
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -8866,7 +8953,8 @@ define void @v_shuffle_v4i32_v4i32__7_4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v1
; GFX940-NEXT: v_mov_b32_e32 v5, v1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -8897,28 +8985,31 @@ define void @v_shuffle_v4i32_v4i32__7_6_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_6_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_6_5_5:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v1
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v7, v1
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -9493,7 +9584,8 @@ define void @v_shuffle_v4i32_v4i32__1_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9510,7 +9602,8 @@ define void @v_shuffle_v4i32_v4i32__1_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: v_mov_b32_e32 v5, v4
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -9608,7 +9701,8 @@ define void @v_shuffle_v4i32_v4i32__3_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v6
; GFX90A-NEXT: v_mov_b32_e32 v7, v6
; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9624,8 +9718,8 @@ define void @v_shuffle_v4i32_v4i32__3_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v5, v6
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
; GFX940-NEXT: v_mov_b32_e32 v7, v6
; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -9705,7 +9799,8 @@ define void @v_shuffle_v4i32_v4i32__5_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9718,7 +9813,8 @@ define void @v_shuffle_v4i32_v4i32__5_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -9801,7 +9897,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9814,7 +9911,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -9900,7 +9998,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9917,7 +10016,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v4
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -10016,7 +10116,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v6
; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -10032,8 +10133,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
; GFX940-NEXT: v_mov_b32_e32 v7, v6
; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -10121,26 +10222,31 @@ define void @v_shuffle_v4i32_v4i32__7_4_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_4_6_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_4_6_6:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: v_mov_b32_e32 v6, v2
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -10380,14 +10486,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v5
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -10397,15 +10504,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v10, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v8, v1
+; GFX940-NEXT: v_mov_b32_e32 v6, v5
+; GFX940-NEXT: v_mov_b32_e32 v7, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v4
+; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -10503,9 +10610,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v0, v7
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v6
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10520,10 +10628,10 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: v_mov_b32_e32 v3, v6
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10606,26 +10714,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_5_6:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -11133,7 +11246,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_7_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v5
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -11150,7 +11264,8 @@ define void @v_shuffle_v4i32_v4i32__7_0_7_7(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v5
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -11249,7 +11364,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_7_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_mov_b32_e32 v6, v7
; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -11265,8 +11381,8 @@ define void @v_shuffle_v4i32_v4i32__7_2_7_7(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
; GFX940-NEXT: v_mov_b32_e32 v6, v7
; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -11355,26 +11471,31 @@ define void @v_shuffle_v4i32_v4i32__7_4_7_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_4_7_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_4_7_7:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -11452,7 +11573,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_7_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -11465,7 +11587,8 @@ define void @v_shuffle_v4i32_v4i32__7_6_7_7(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll
index f9253ca1b1ea46a..2dd794404158c3a 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll
@@ -176,7 +176,8 @@ define void @v_shuffle_v4p3_v2p3__3_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -191,8 +192,8 @@ define void @v_shuffle_v4p3_v2p3__3_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -565,15 +566,16 @@ define void @v_shuffle_v4p3_v2p3__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -581,15 +583,16 @@ define void @v_shuffle_v4p3_v2p3__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -672,26 +675,31 @@ define void @v_shuffle_v4p3_v2p3__3_3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_3_2:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -868,7 +876,8 @@ define void @v_shuffle_v4p3_v2p3__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -881,7 +890,8 @@ define void @v_shuffle_v4p3_v2p3__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -968,7 +978,8 @@ define void @v_shuffle_v4p3_v2p3__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -984,7 +995,8 @@ define void @v_shuffle_v4p3_v2p3__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1132,13 +1144,14 @@ define void @v_shuffle_v4p3_v2p3__3_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1148,13 +1161,14 @@ define void @v_shuffle_v4p3_v2p3__3_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1295,15 +1309,16 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1311,15 +1326,16 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1713,7 +1729,8 @@ define void @v_shuffle_v4p3_v2p3__3_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1729,7 +1746,8 @@ define void @v_shuffle_v4p3_v2p3__3_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1764,13 +1782,14 @@ define void @v_shuffle_v4p3_v2p3__3_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1780,13 +1799,14 @@ define void @v_shuffle_v4p3_v2p3__3_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2138,7 +2158,8 @@ define void @v_shuffle_v4p3_v2p3__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2151,7 +2172,8 @@ define void @v_shuffle_v4p3_v2p3__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -2230,15 +2252,16 @@ define void @v_shuffle_v4p3_v2p3__3_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2246,15 +2269,16 @@ define void @v_shuffle_v4p3_v2p3__3_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2498,13 +2522,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2514,13 +2540,15 @@ define void @v_shuffle_v4p3_v2p3__3_3_1_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2804,15 +2832,16 @@ define void @v_shuffle_v4p3_v2p3__3_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2820,15 +2849,16 @@ define void @v_shuffle_v4p3_v2p3__3_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2914,7 +2944,8 @@ define void @v_shuffle_v4p3_v2p3__3_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2927,7 +2958,8 @@ define void @v_shuffle_v4p3_v2p3__3_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll
index 28bc61ce57815ad..d6bd85428318019 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll
@@ -1280,7 +1280,8 @@ define void @v_shuffle_v4p3_v3p3__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
@@ -1295,7 +1296,8 @@ define void @v_shuffle_v4p3_v3p3__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
@@ -1433,7 +1435,8 @@ define void @v_shuffle_v4p3_v3p3__4_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
@@ -1450,8 +1453,8 @@ define void @v_shuffle_v4p3_v3p3__4_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
@@ -1961,14 +1964,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1978,15 +1982,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v5, 0
+; GFX940-NEXT: v_mov_b32_e32 v7, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:4]
+; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v2
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2138,16 +2142,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_mov_b32_e32 v2, v7
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2155,17 +2160,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:2]
+; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v5, 0
+; GFX940-NEXT: v_mov_b32_e32 v9, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:4]
+; GFX940-NEXT: ; def v[6:8]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v0, v8
+; GFX940-NEXT: v_mov_b32_e32 v1, v8
+; GFX940-NEXT: v_mov_b32_e32 v2, v7
+; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3272,9 +3277,9 @@ define void @v_shuffle_v4p3_v3p3__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -3287,8 +3292,8 @@ define void @v_shuffle_v4p3_v3p3__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -3416,12 +3421,12 @@ define void @v_shuffle_v4p3_v3p3__4_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -3434,12 +3439,12 @@ define void @v_shuffle_v4p3_v3p3__4_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v7, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -3986,17 +3991,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4004,17 +4009,18 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:2]
+; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v7, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
+; GFX940-NEXT: v_mov_b32_e32 v5, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:6]
+; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v6
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v2
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4106,17 +4112,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_mov_b32_e32 v2, v7
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4124,18 +4130,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:2]
+; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v7, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
+; GFX940-NEXT: v_mov_b32_e32 v5, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:6]
+; GFX940-NEXT: ; def v[6:8]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v6
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v0, v8
+; GFX940-NEXT: v_mov_b32_e32 v1, v8
+; GFX940-NEXT: v_mov_b32_e32 v2, v7
+; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4312,7 +4317,8 @@ define void @v_shuffle_v4p3_v3p3__4_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
@@ -4327,7 +4333,8 @@ define void @v_shuffle_v4p3_v3p3__4_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
@@ -4851,16 +4858,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4868,17 +4876,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_3(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:2]
+; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v5, 0
+; GFX940-NEXT: v_mov_b32_e32 v7, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:4]
+; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4964,28 +4972,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_4_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:4]
+; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v5, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -6083,9 +6094,9 @@ define void @v_shuffle_v4p3_v3p3__1_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6102,8 +6113,8 @@ define void @v_shuffle_v4p3_v3p3__1_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -6241,9 +6252,9 @@ define void @v_shuffle_v4p3_v3p3__4_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6256,8 +6267,8 @@ define void @v_shuffle_v4p3_v3p3__4_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -6716,16 +6727,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v6
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6733,17 +6745,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:2]
+; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v5, 0
+; GFX940-NEXT: v_mov_b32_e32 v7, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:4]
+; GFX940-NEXT: ; def v[4:6]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_mov_b32_e32 v3, v6
+; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -6881,10 +6893,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6897,8 +6910,9 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v5, 0
; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll
index 9cc1b9fe6cf0e08..e4c7cebb231bd88 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll
@@ -343,12 +343,13 @@ define void @v_shuffle_v4p3_v4p3__7_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -363,8 +364,9 @@ define void @v_shuffle_v4p3_v4p3__7_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -454,7 +456,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -469,8 +472,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -557,8 +560,9 @@ define void @v_shuffle_v4p3_v4p3__7_4_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -569,8 +573,9 @@ define void @v_shuffle_v4p3_v4p3__7_4_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -644,7 +649,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -656,7 +662,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -1162,10 +1169,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1180,10 +1188,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v5
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1278,16 +1287,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v6, v7
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1295,17 +1305,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v6, v7
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1392,26 +1402,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_7_4:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1493,26 +1508,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_7_6:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1689,7 +1709,8 @@ define void @v_shuffle_v4p3_v4p3__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -1703,7 +1724,8 @@ define void @v_shuffle_v4p3_v4p3__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -1787,7 +1809,8 @@ define void @v_shuffle_v4p3_v4p3__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -1801,7 +1824,8 @@ define void @v_shuffle_v4p3_v4p3__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -1890,7 +1914,8 @@ define void @v_shuffle_v4p3_v4p3__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -1908,7 +1933,8 @@ define void @v_shuffle_v4p3_v4p3__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -2012,7 +2038,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -2030,7 +2057,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -2186,16 +2214,17 @@ define void @v_shuffle_v4p3_v4p3__7_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2203,16 +2232,17 @@ define void @v_shuffle_v4p3_v4p3__7_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
-; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v6, v0
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2309,11 +2339,12 @@ define void @v_shuffle_v4p3_v4p3__7_4_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2323,15 +2354,15 @@ define void @v_shuffle_v4p3_v4p3__7_4_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v6, v0
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2429,7 +2460,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -2447,7 +2479,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -2610,10 +2643,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2628,10 +2662,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v5
+; GFX940-NEXT: v_mov_b32_e32 v4, v1
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2724,16 +2759,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2741,16 +2777,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2846,11 +2883,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2860,15 +2897,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3481,7 +3517,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v1
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -3499,7 +3536,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v1
; GFX940-NEXT: v_mov_b32_e32 v5, v1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -3535,16 +3573,17 @@ define void @v_shuffle_v4p3_v4p3__7_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3552,16 +3591,17 @@ define void @v_shuffle_v4p3_v4p3__7_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
-; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v1
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v7, v1
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3657,11 +3697,12 @@ define void @v_shuffle_v4p3_v4p3__7_4_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3671,15 +3712,15 @@ define void @v_shuffle_v4p3_v4p3__7_4_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v1
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v7, v1
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3777,7 +3818,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: v_mov_b32_e32 v4, v1
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -3795,7 +3837,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: v_mov_b32_e32 v4, v1
; GFX940-NEXT: v_mov_b32_e32 v5, v1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -4401,7 +4444,8 @@ define void @v_shuffle_v4p3_v4p3__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -4414,7 +4458,8 @@ define void @v_shuffle_v4p3_v4p3__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -4495,7 +4540,8 @@ define void @v_shuffle_v4p3_v4p3__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -4508,7 +4554,8 @@ define void @v_shuffle_v4p3_v4p3__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -4592,7 +4639,8 @@ define void @v_shuffle_v4p3_v4p3__5_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -4608,8 +4656,8 @@ define void @v_shuffle_v4p3_v4p3__5_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -4707,7 +4755,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -4723,8 +4772,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -4814,15 +4863,17 @@ define void @v_shuffle_v4p3_v4p3__7_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4830,15 +4881,17 @@ define void @v_shuffle_v4p3_v4p3__7_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: v_mov_b32_e32 v6, v2
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4989,7 +5042,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -5006,7 +5060,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -5103,7 +5158,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -5120,7 +5176,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_2_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -5331,16 +5388,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5348,16 +5406,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5390,16 +5449,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5407,16 +5467,17 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5507,16 +5568,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v7
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5524,17 +5585,16 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_2(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v10, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v6, v7
+; GFX940-NEXT: v_mov_b32_e32 v8, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v2
+; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -6133,15 +6193,17 @@ define void @v_shuffle_v4p3_v4p3__7_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6149,15 +6211,17 @@ define void @v_shuffle_v4p3_v4p3__7_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:3]
+; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[4:7]
+; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -6250,7 +6314,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6266,8 +6331,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -6307,7 +6372,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6324,7 +6390,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -6421,7 +6488,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6438,7 +6506,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -7126,7 +7195,8 @@ define void @v_shuffle_v4p3_v4p3__5_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -7140,7 +7210,8 @@ define void @v_shuffle_v4p3_v4p3__5_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -7226,7 +7297,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -7240,7 +7312,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -7331,7 +7404,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_mov_b32_e32 v6, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v2
; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
@@ -7349,7 +7423,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[4:5], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: v_mov_b32_e32 v6, v2
; GFX940-NEXT: v_mov_b32_e32 v7, v2
; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
@@ -7448,7 +7523,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
@@ -7465,8 +7541,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
@@ -7610,28 +7686,31 @@ define void @v_shuffle_v4p3_v4p3__7_6_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_6_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_6_4_4:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v0
-; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v6, v0
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -7825,11 +7904,11 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7839,15 +7918,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -7944,9 +8022,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v0, v7
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -7961,10 +8040,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -7997,8 +8076,9 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -8010,8 +8090,9 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v4, v1
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -8622,10 +8703,12 @@ define void @v_shuffle_v4p3_v4p3__7_0_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -8635,14 +8718,16 @@ define void @v_shuffle_v4p3_v4p3__7_0_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v4, v5
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -8737,7 +8822,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v5
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
@@ -8754,8 +8840,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v2, v5
; GFX940-NEXT: v_mov_b32_e32 v3, v5
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
@@ -8852,7 +8938,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v1
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
@@ -8866,7 +8953,8 @@ define void @v_shuffle_v4p3_v4p3__7_4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v1
; GFX940-NEXT: v_mov_b32_e32 v5, v1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
@@ -8897,28 +8985,31 @@ define void @v_shuffle_v4p3_v4p3__7_6_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_6_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_6_5_5:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v4, v1
-; GFX940-NEXT: v_mov_b32_e32 v5, v1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v7, v1
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -9493,7 +9584,8 @@ define void @v_shuffle_v4p3_v4p3__1_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9510,7 +9602,8 @@ define void @v_shuffle_v4p3_v4p3__1_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: v_mov_b32_e32 v5, v4
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -9608,7 +9701,8 @@ define void @v_shuffle_v4p3_v4p3__3_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v6
; GFX90A-NEXT: v_mov_b32_e32 v7, v6
; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9624,8 +9718,8 @@ define void @v_shuffle_v4p3_v4p3__3_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[6:7] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v5, v6
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
; GFX940-NEXT: v_mov_b32_e32 v7, v6
; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -9705,7 +9799,8 @@ define void @v_shuffle_v4p3_v4p3__5_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9718,7 +9813,8 @@ define void @v_shuffle_v4p3_v4p3__5_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -9801,7 +9897,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9814,7 +9911,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -9900,7 +9998,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -9917,7 +10016,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v4
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -10016,7 +10116,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v6
; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -10032,8 +10133,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_6_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
; GFX940-NEXT: v_mov_b32_e32 v7, v6
; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -10121,26 +10222,31 @@ define void @v_shuffle_v4p3_v4p3__7_4_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_4_6_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_4_6_6:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: v_mov_b32_e32 v6, v2
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -10380,14 +10486,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v5
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -10397,15 +10504,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v10, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v8, v1
+; GFX940-NEXT: v_mov_b32_e32 v6, v5
+; GFX940-NEXT: v_mov_b32_e32 v7, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v4
+; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -10503,9 +10610,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v0, v7
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v6
; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10520,10 +10628,10 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[6:7] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: v_mov_b32_e32 v0, v7
; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: v_mov_b32_e32 v3, v6
; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10606,26 +10714,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_5_6:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[2:3] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v6, v1
+; GFX940-NEXT: v_mov_b32_e32 v7, v2
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -11133,7 +11246,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_7_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, v5
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -11150,7 +11264,8 @@ define void @v_shuffle_v4p3_v4p3__7_0_7_7(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_mov_b32_e32 v4, v5
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -11249,7 +11364,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_7_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: v_mov_b32_e32 v6, v7
; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -11265,8 +11381,8 @@ define void @v_shuffle_v4p3_v4p3__7_2_7_7(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v4, v7
+; GFX940-NEXT: v_mov_b32_e32 v5, v2
; GFX940-NEXT: v_mov_b32_e32 v6, v7
; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -11355,26 +11471,31 @@ define void @v_shuffle_v4p3_v4p3__7_4_7_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_4_7_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_4_7_7:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v0
+; GFX940-NEXT: v_mov_b32_e32 v6, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -11452,7 +11573,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_7_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -11465,7 +11587,8 @@ define void @v_shuffle_v4p3_v4p3__7_6_7_7(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
More information about the llvm-branch-commits
mailing list