[llvm-branch-commits] [llvm] AMDGPU: Make vector_shuffle legal for v2i32 with v_pk_mov_b32 (PR #123684)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jan 20 20:32:47 PST 2025
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/123684
For VALU shuffles, this saves an instruction in some cases.
>From c5caf560857f3c4f71416940a528df5ce75212bc Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 9 Jan 2025 18:34:29 +0700
Subject: [PATCH] AMDGPU: Make vector_shuffle legal for v2i32 with v_pk_mov_b32
For VALU shuffles, this saves an instruction in some cases.
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 114 +++
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 +
.../AMDGPU/shufflevector.v2f32.v2f32.ll | 49 +-
.../AMDGPU/shufflevector.v2f32.v3f32.ll | 40 +-
.../AMDGPU/shufflevector.v2f32.v4f32.ll | 84 +-
.../AMDGPU/shufflevector.v2f32.v8f32.ll | 272 +++---
.../AMDGPU/shufflevector.v2i32.v2i32.ll | 49 +-
.../AMDGPU/shufflevector.v2i32.v3i32.ll | 40 +-
.../AMDGPU/shufflevector.v2i32.v4i32.ll | 84 +-
.../AMDGPU/shufflevector.v2i32.v8i32.ll | 272 +++---
.../CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll | 49 +-
.../CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll | 40 +-
.../CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll | 84 +-
.../CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll | 272 +++---
.../AMDGPU/shufflevector.v4i64.v3i64.ll | 787 +++++++++++-------
.../CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll | 787 +++++++++++-------
.../CodeGen/AMDGPU/vector_shuffle.packed.ll | 96 +--
.../InferAddressSpaces/AMDGPU/flat_atomic.ll | 3 +-
19 files changed, 1723 insertions(+), 1407 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 6d5c3b5e0742b3..8d03fde8911242 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -489,6 +489,90 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}
+void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+
+ // Anything but a 2 x 32-bit shuffle on a v_pk_mov_b32 subtarget goes to SelectCode. TODO: Handle 16-bit element vectors with even aligned masks.
+ if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
+ VT.getVectorNumElements() != 2) {
+ SelectCode(N);
+ return;
+ }
+
+ auto *SVN = cast<ShuffleVectorSDNode>(N);
+
+ SDValue Src0 = SVN->getOperand(0);
+ SDValue Src1 = SVN->getOperand(1);
+ ArrayRef<int> Mask = SVN->getMask();
+ SDLoc DL(N);
+
+ assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
+ Mask[0] < 4 && Mask[1] < 4);
+
+ SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
+ SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
+ unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
+ unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
+
+ if (Mask[0] < 0) {
+ Src0SubReg = Src1SubReg;
+ MachineSDNode *ImpDef =
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+ VSrc0 = SDValue(ImpDef, 0);
+ }
+
+ if (Mask[1] < 0) {
+ Src1SubReg = Src0SubReg;
+ MachineSDNode *ImpDef =
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+ VSrc1 = SDValue(ImpDef, 0);
+ }
+
+ // Only a divergent shuffle that takes its low result element from a sub1 and
+ // its high result element from a sub0 is selected to V_PK_MOV_B32 here.
+ //
+ // Everything else, including the uniform (SGPR) case, falls through to the
+ // subregister extract + REG_SEQUENCE lowering (plain copies) further below.
+ // TODO: Maybe we should fold this out earlier.
+ if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
+ Src1SubReg == AMDGPU::sub0) {
+ // The low result element always comes from src0, the high one from src1.
+ // op_sel selects the high half of src0; op_sel_hi the high half of src1.
+ // Given the condition above, Src0OpSel always evaluates to OP_SEL_0 and
+ // Src1OpSel always evaluates to NONE in this branch.
+
+ unsigned Src0OpSel =
+ Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
+ unsigned Src1OpSel =
+ Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
+
+ SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
+ SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
+ SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+ CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
+ {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
+ ZeroMods, // clamp
+ ZeroMods, // op_sel
+ ZeroMods, // op_sel_hi
+ ZeroMods, // neg_lo
+ ZeroMods}); // neg_hi
+ return;
+ }
+
+ SDValue ResultElt0 =
+ CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
+ SDValue ResultElt1 =
+ CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);
+
+ const SDValue Ops[] = {
+ CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
+ ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+ ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
+ CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
+}
+
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
@@ -562,6 +646,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectBuildVector(N, RegClassID);
return;
}
+ case ISD::VECTOR_SHUFFLE:
+ SelectVectorShuffle(N);
+ return;
case ISD::BUILD_PAIR: {
SDValue RC, SubReg0, SubReg1;
SDLoc DL(N);
@@ -3101,6 +3188,33 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
}
Mods = VecMods;
+ } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ Src.getNumOperands() == 2) {
+
+ // TODO: We should repeat the build_vector source check above for the
+ // vector_shuffle for negates and casts of individual elements.
+
+ auto *SVN = cast<ShuffleVectorSDNode>(Src);
+ ArrayRef<int> Mask = SVN->getMask();
+
+ if (Mask[0] < 2 && Mask[1] < 2) {
+ // src1 should be undef.
+ SDValue ShuffleSrc = SVN->getOperand(0);
+
+ if (ShuffleSrc.getOpcode() == ISD::FNEG) {
+ ShuffleSrc = ShuffleSrc.getOperand(0);
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ }
+
+ if (Mask[0] == 1)
+ Mods |= SISrcMods::OP_SEL_0;
+ if (Mask[1] == 1)
+ Mods |= SISrcMods::OP_SEL_1;
+
+ Src = ShuffleSrc;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+ }
}
// Packed instructions do not have abs modifiers.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 7e61eb470622f1..7dcd208a9cdd41 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -86,6 +86,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
protected:
void SelectBuildVector(SDNode *N, unsigned RegClassID);
+ void SelectVectorShuffle(SDNode *N);
private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6cf5774fc53b06..1aeca7f370aa1b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -422,6 +422,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
{MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
Expand);
+ if (Subtarget->hasPkMovB32()) {
+ // TODO: 16-bit element vectors should be legal with even aligned elements.
+ // TODO: Can be legal with wider source types than the result with
+ // subregister extracts.
+ setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
+ }
+
setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
Custom);
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll
index 3410b067fb5b4e..47b15a032dedb8 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll
@@ -171,15 +171,14 @@ define void @v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -187,15 +186,15 @@ define void @v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -274,27 +273,24 @@ define void @v_shuffle_v2f32_v2f32__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2f32_v2f32__3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v2f32_v2f32__3_2:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -447,27 +443,24 @@ define void @v_shuffle_v2f32_v2f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2f32_v2f32__1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v2f32_v2f32__1_0:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll
index 7edb6939f884c1..3960a59b65e63e 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll
@@ -632,10 +632,9 @@ define void @v_shuffle_v2f32_v3f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -645,10 +644,9 @@ define void @v_shuffle_v2f32_v3f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -765,13 +763,12 @@ define void @v_shuffle_v2f32_v3f32__4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -786,9 +783,8 @@ define void @v_shuffle_v2f32_v3f32__4_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1480,10 +1476,9 @@ define void @v_shuffle_v2f32_v3f32__4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1493,10 +1488,9 @@ define void @v_shuffle_v2f32_v3f32__4_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll
index ea02b31bff04fd..46090aef4a0df5 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll
@@ -335,13 +335,12 @@ define void @v_shuffle_v2f32_v4f32__7_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -356,9 +355,8 @@ define void @v_shuffle_v2f32_v4f32__7_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -447,8 +445,7 @@ define void @v_shuffle_v2f32_v4f32__7_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -463,8 +460,8 @@ define void @v_shuffle_v2f32_v4f32__7_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -637,8 +634,7 @@ define void @v_shuffle_v2f32_v4f32__7_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -650,8 +646,7 @@ define void @v_shuffle_v2f32_v4f32__7_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -809,9 +804,8 @@ define void @v_shuffle_v2f32_v4f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -822,9 +816,8 @@ define void @v_shuffle_v2f32_v4f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -984,13 +977,12 @@ define void @v_shuffle_v2f32_v4f32__5_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1005,9 +997,8 @@ define void @v_shuffle_v2f32_v4f32__5_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1607,8 +1598,7 @@ define void @v_shuffle_v2f32_v4f32__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1620,8 +1610,7 @@ define void @v_shuffle_v2f32_v4f32__3_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -1698,8 +1687,7 @@ define void @v_shuffle_v2f32_v4f32__5_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1714,8 +1702,8 @@ define void @v_shuffle_v2f32_v4f32__5_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -2331,9 +2319,8 @@ define void @v_shuffle_v2f32_v4f32__5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2344,9 +2331,8 @@ define void @v_shuffle_v2f32_v4f32__5_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -2898,8 +2884,7 @@ define void @v_shuffle_v2f32_v4f32__1_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -2914,8 +2899,8 @@ define void @v_shuffle_v2f32_v4f32__1_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -3005,8 +2990,7 @@ define void @v_shuffle_v2f32_v4f32__3_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3021,8 +3005,8 @@ define void @v_shuffle_v2f32_v4f32__3_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll
index 0fc63853f63ab8..1915fcdafd69d9 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll
@@ -659,13 +659,12 @@ define void @v_shuffle_v2f32_v8f32__15_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v9
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -680,9 +679,8 @@ define void @v_shuffle_v2f32_v8f32__15_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v9
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x float> asm "; def $0", "=v"()
@@ -771,8 +769,7 @@ define void @v_shuffle_v2f32_v8f32__15_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v11
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -787,8 +784,8 @@ define void @v_shuffle_v2f32_v8f32__15_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v11
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -878,8 +875,7 @@ define void @v_shuffle_v2f32_v8f32__15_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v13
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -894,8 +890,8 @@ define void @v_shuffle_v2f32_v8f32__15_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: v_mov_b32_e32 v0, v13
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -985,8 +981,7 @@ define void @v_shuffle_v2f32_v8f32__15_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v15
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1001,8 +996,8 @@ define void @v_shuffle_v2f32_v8f32__15_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v15
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -1351,8 +1346,7 @@ define void @v_shuffle_v2f32_v8f32__15_14(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1364,8 +1358,7 @@ define void @v_shuffle_v2f32_v8f32__15_14(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -1523,9 +1516,8 @@ define void @v_shuffle_v2f32_v8f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1536,9 +1528,8 @@ define void @v_shuffle_v2f32_v8f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x float> asm "; def $0", "=v"()
@@ -1870,13 +1861,12 @@ define void @v_shuffle_v2f32_v8f32__9_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1891,9 +1881,8 @@ define void @v_shuffle_v2f32_v8f32__9_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x float> asm "; def $0", "=v"()
@@ -1978,13 +1967,12 @@ define void @v_shuffle_v2f32_v8f32__11_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1999,9 +1987,8 @@ define void @v_shuffle_v2f32_v8f32__11_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x float> asm "; def $0", "=v"()
@@ -2086,13 +2073,12 @@ define void @v_shuffle_v2f32_v8f32__13_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v7
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2107,9 +2093,8 @@ define void @v_shuffle_v2f32_v8f32__13_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v7
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x float> asm "; def $0", "=v"()
@@ -3089,8 +3074,7 @@ define void @v_shuffle_v2f32_v8f32__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3102,8 +3086,7 @@ define void @v_shuffle_v2f32_v8f32__3_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -3352,8 +3335,7 @@ define void @v_shuffle_v2f32_v8f32__9_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3368,8 +3350,8 @@ define void @v_shuffle_v2f32_v8f32__9_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -3459,8 +3441,7 @@ define void @v_shuffle_v2f32_v8f32__11_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3475,8 +3456,8 @@ define void @v_shuffle_v2f32_v8f32__11_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -3566,8 +3547,7 @@ define void @v_shuffle_v2f32_v8f32__13_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v9
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3582,8 +3562,8 @@ define void @v_shuffle_v2f32_v8f32__13_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v9
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -4650,8 +4630,7 @@ define void @v_shuffle_v2f32_v8f32__5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -4663,8 +4642,7 @@ define void @v_shuffle_v2f32_v8f32__5_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -4827,8 +4805,7 @@ define void @v_shuffle_v2f32_v8f32__9_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -4843,8 +4820,8 @@ define void @v_shuffle_v2f32_v8f32__9_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -4934,8 +4911,7 @@ define void @v_shuffle_v2f32_v8f32__11_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v9
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -4950,8 +4926,8 @@ define void @v_shuffle_v2f32_v8f32__11_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: v_mov_b32_e32 v0, v9
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -5041,8 +5017,7 @@ define void @v_shuffle_v2f32_v8f32__13_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v11
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -5057,8 +5032,8 @@ define void @v_shuffle_v2f32_v8f32__13_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: v_mov_b32_e32 v0, v11
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -6211,8 +6186,7 @@ define void @v_shuffle_v2f32_v8f32__7_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6224,8 +6198,7 @@ define void @v_shuffle_v2f32_v8f32__7_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -6302,8 +6275,7 @@ define void @v_shuffle_v2f32_v8f32__9_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v9
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6318,8 +6290,8 @@ define void @v_shuffle_v2f32_v8f32__9_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v9
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -6409,8 +6381,7 @@ define void @v_shuffle_v2f32_v8f32__11_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v11
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6425,8 +6396,8 @@ define void @v_shuffle_v2f32_v8f32__11_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v11
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -6516,8 +6487,7 @@ define void @v_shuffle_v2f32_v8f32__13_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v13
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6532,8 +6502,8 @@ define void @v_shuffle_v2f32_v8f32__13_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v13
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -7689,9 +7659,8 @@ define void @v_shuffle_v2f32_v8f32__9_8(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7702,9 +7671,8 @@ define void @v_shuffle_v2f32_v8f32__9_8(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x float> asm "; def $0", "=v"()
@@ -8816,8 +8784,7 @@ define void @v_shuffle_v2f32_v8f32__1_10(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -8832,8 +8799,8 @@ define void @v_shuffle_v2f32_v8f32__1_10(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -8923,8 +8890,7 @@ define void @v_shuffle_v2f32_v8f32__3_10(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -8939,8 +8905,8 @@ define void @v_shuffle_v2f32_v8f32__3_10(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -9030,8 +8996,7 @@ define void @v_shuffle_v2f32_v8f32__5_10(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -9046,8 +9011,8 @@ define void @v_shuffle_v2f32_v8f32__5_10(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v8
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -9137,8 +9102,7 @@ define void @v_shuffle_v2f32_v8f32__7_10(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v10
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -9153,8 +9117,8 @@ define void @v_shuffle_v2f32_v8f32__7_10(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v10
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -9315,8 +9279,7 @@ define void @v_shuffle_v2f32_v8f32__11_10(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -9328,8 +9291,7 @@ define void @v_shuffle_v2f32_v8f32__11_10(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10354,8 +10316,7 @@ define void @v_shuffle_v2f32_v8f32__1_12(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10370,8 +10331,8 @@ define void @v_shuffle_v2f32_v8f32__1_12(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10461,8 +10422,7 @@ define void @v_shuffle_v2f32_v8f32__3_12(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10477,8 +10437,8 @@ define void @v_shuffle_v2f32_v8f32__3_12(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v8
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10568,8 +10528,7 @@ define void @v_shuffle_v2f32_v8f32__5_12(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v10
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10584,8 +10543,8 @@ define void @v_shuffle_v2f32_v8f32__5_12(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v10
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10675,8 +10634,7 @@ define void @v_shuffle_v2f32_v8f32__7_12(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v12
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10691,8 +10649,8 @@ define void @v_shuffle_v2f32_v8f32__7_12(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v12
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10941,8 +10899,7 @@ define void @v_shuffle_v2f32_v8f32__13_12(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10954,8 +10911,7 @@ define void @v_shuffle_v2f32_v8f32__13_12(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -11892,8 +11848,7 @@ define void @v_shuffle_v2f32_v8f32__1_14(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -11908,8 +11863,8 @@ define void @v_shuffle_v2f32_v8f32__1_14(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v1, v8
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -11999,8 +11954,7 @@ define void @v_shuffle_v2f32_v8f32__3_14(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v10
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -12015,8 +11969,8 @@ define void @v_shuffle_v2f32_v8f32__3_14(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v10
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -12106,8 +12060,7 @@ define void @v_shuffle_v2f32_v8f32__5_14(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v12
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -12122,8 +12075,8 @@ define void @v_shuffle_v2f32_v8f32__5_14(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v12
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -12213,8 +12166,7 @@ define void @v_shuffle_v2f32_v8f32__7_14(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v14
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -12229,8 +12181,8 @@ define void @v_shuffle_v2f32_v8f32__7_14(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v14
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll
index 632e8d2a32bad8..90db7643eb6b14 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll
@@ -171,15 +171,14 @@ define void @v_shuffle_v2i32_v2i32__3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -187,15 +186,15 @@ define void @v_shuffle_v2i32_v2i32__3_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -274,27 +273,24 @@ define void @v_shuffle_v2i32_v2i32__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2i32_v2i32__3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v2i32_v2i32__3_2:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -447,27 +443,24 @@ define void @v_shuffle_v2i32_v2i32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2i32_v2i32__1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v2i32_v2i32__1_0:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll
index fb6671ca787012..784ef6b362f579 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll
@@ -632,10 +632,9 @@ define void @v_shuffle_v2i32_v3i32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -645,10 +644,9 @@ define void @v_shuffle_v2i32_v3i32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -765,13 +763,12 @@ define void @v_shuffle_v2i32_v3i32__4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -786,9 +783,8 @@ define void @v_shuffle_v2i32_v3i32__4_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1480,10 +1476,9 @@ define void @v_shuffle_v2i32_v3i32__4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1493,10 +1488,9 @@ define void @v_shuffle_v2i32_v3i32__4_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll
index b4051228a443e8..b661ee63360ea3 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll
@@ -335,13 +335,12 @@ define void @v_shuffle_v2i32_v4i32__7_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -356,9 +355,8 @@ define void @v_shuffle_v2i32_v4i32__7_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -447,8 +445,7 @@ define void @v_shuffle_v2i32_v4i32__7_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -463,8 +460,8 @@ define void @v_shuffle_v2i32_v4i32__7_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -637,8 +634,7 @@ define void @v_shuffle_v2i32_v4i32__7_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -650,8 +646,7 @@ define void @v_shuffle_v2i32_v4i32__7_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -809,9 +804,8 @@ define void @v_shuffle_v2i32_v4i32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -822,9 +816,8 @@ define void @v_shuffle_v2i32_v4i32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -984,13 +977,12 @@ define void @v_shuffle_v2i32_v4i32__5_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1005,9 +997,8 @@ define void @v_shuffle_v2i32_v4i32__5_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1607,8 +1598,7 @@ define void @v_shuffle_v2i32_v4i32__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1620,8 +1610,7 @@ define void @v_shuffle_v2i32_v4i32__3_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -1698,8 +1687,7 @@ define void @v_shuffle_v2i32_v4i32__5_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1714,8 +1702,8 @@ define void @v_shuffle_v2i32_v4i32__5_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -2331,9 +2319,8 @@ define void @v_shuffle_v2i32_v4i32__5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2344,9 +2331,8 @@ define void @v_shuffle_v2i32_v4i32__5_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -2898,8 +2884,7 @@ define void @v_shuffle_v2i32_v4i32__1_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -2914,8 +2899,8 @@ define void @v_shuffle_v2i32_v4i32__1_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -3005,8 +2990,7 @@ define void @v_shuffle_v2i32_v4i32__3_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3021,8 +3005,8 @@ define void @v_shuffle_v2i32_v4i32__3_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll
index 11d1b88a938f2e..428ccf207213c2 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll
@@ -659,13 +659,12 @@ define void @v_shuffle_v2i32_v8i32__15_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v9
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -680,9 +679,8 @@ define void @v_shuffle_v2i32_v8i32__15_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v9
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x i32> asm "; def $0", "=v"()
@@ -771,8 +769,7 @@ define void @v_shuffle_v2i32_v8i32__15_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v11
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -787,8 +784,8 @@ define void @v_shuffle_v2i32_v8i32__15_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v11
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -878,8 +875,7 @@ define void @v_shuffle_v2i32_v8i32__15_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v13
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -894,8 +890,8 @@ define void @v_shuffle_v2i32_v8i32__15_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: v_mov_b32_e32 v0, v13
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -985,8 +981,7 @@ define void @v_shuffle_v2i32_v8i32__15_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v15
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1001,8 +996,8 @@ define void @v_shuffle_v2i32_v8i32__15_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v15
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -1351,8 +1346,7 @@ define void @v_shuffle_v2i32_v8i32__15_14(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1364,8 +1358,7 @@ define void @v_shuffle_v2i32_v8i32__15_14(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -1523,9 +1516,8 @@ define void @v_shuffle_v2i32_v8i32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1536,9 +1528,8 @@ define void @v_shuffle_v2i32_v8i32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x i32> asm "; def $0", "=v"()
@@ -1870,13 +1861,12 @@ define void @v_shuffle_v2i32_v8i32__9_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1891,9 +1881,8 @@ define void @v_shuffle_v2i32_v8i32__9_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x i32> asm "; def $0", "=v"()
@@ -1978,13 +1967,12 @@ define void @v_shuffle_v2i32_v8i32__11_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1999,9 +1987,8 @@ define void @v_shuffle_v2i32_v8i32__11_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x i32> asm "; def $0", "=v"()
@@ -2086,13 +2073,12 @@ define void @v_shuffle_v2i32_v8i32__13_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v7
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2107,9 +2093,8 @@ define void @v_shuffle_v2i32_v8i32__13_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v7
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x i32> asm "; def $0", "=v"()
@@ -3089,8 +3074,7 @@ define void @v_shuffle_v2i32_v8i32__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3102,8 +3086,7 @@ define void @v_shuffle_v2i32_v8i32__3_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -3352,8 +3335,7 @@ define void @v_shuffle_v2i32_v8i32__9_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3368,8 +3350,8 @@ define void @v_shuffle_v2i32_v8i32__9_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -3459,8 +3441,7 @@ define void @v_shuffle_v2i32_v8i32__11_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3475,8 +3456,8 @@ define void @v_shuffle_v2i32_v8i32__11_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -3566,8 +3547,7 @@ define void @v_shuffle_v2i32_v8i32__13_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v9
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3582,8 +3562,8 @@ define void @v_shuffle_v2i32_v8i32__13_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v9
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -4650,8 +4630,7 @@ define void @v_shuffle_v2i32_v8i32__5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -4663,8 +4642,7 @@ define void @v_shuffle_v2i32_v8i32__5_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -4827,8 +4805,7 @@ define void @v_shuffle_v2i32_v8i32__9_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -4843,8 +4820,8 @@ define void @v_shuffle_v2i32_v8i32__9_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -4934,8 +4911,7 @@ define void @v_shuffle_v2i32_v8i32__11_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v9
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -4950,8 +4926,8 @@ define void @v_shuffle_v2i32_v8i32__11_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: v_mov_b32_e32 v0, v9
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -5041,8 +5017,7 @@ define void @v_shuffle_v2i32_v8i32__13_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v11
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -5057,8 +5032,8 @@ define void @v_shuffle_v2i32_v8i32__13_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: v_mov_b32_e32 v0, v11
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -6211,8 +6186,7 @@ define void @v_shuffle_v2i32_v8i32__7_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6224,8 +6198,7 @@ define void @v_shuffle_v2i32_v8i32__7_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -6302,8 +6275,7 @@ define void @v_shuffle_v2i32_v8i32__9_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v9
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6318,8 +6290,8 @@ define void @v_shuffle_v2i32_v8i32__9_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v9
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -6409,8 +6381,7 @@ define void @v_shuffle_v2i32_v8i32__11_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v11
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6425,8 +6396,8 @@ define void @v_shuffle_v2i32_v8i32__11_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v11
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -6516,8 +6487,7 @@ define void @v_shuffle_v2i32_v8i32__13_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v13
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6532,8 +6502,8 @@ define void @v_shuffle_v2i32_v8i32__13_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v13
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -7689,9 +7659,8 @@ define void @v_shuffle_v2i32_v8i32__9_8(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7702,9 +7671,8 @@ define void @v_shuffle_v2i32_v8i32__9_8(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x i32> asm "; def $0", "=v"()
@@ -8816,8 +8784,7 @@ define void @v_shuffle_v2i32_v8i32__1_10(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -8832,8 +8799,8 @@ define void @v_shuffle_v2i32_v8i32__1_10(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -8923,8 +8890,7 @@ define void @v_shuffle_v2i32_v8i32__3_10(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -8939,8 +8905,8 @@ define void @v_shuffle_v2i32_v8i32__3_10(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -9030,8 +8996,7 @@ define void @v_shuffle_v2i32_v8i32__5_10(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -9046,8 +9011,8 @@ define void @v_shuffle_v2i32_v8i32__5_10(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v8
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -9137,8 +9102,7 @@ define void @v_shuffle_v2i32_v8i32__7_10(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v10
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -9153,8 +9117,8 @@ define void @v_shuffle_v2i32_v8i32__7_10(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v10
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -9315,8 +9279,7 @@ define void @v_shuffle_v2i32_v8i32__11_10(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -9328,8 +9291,7 @@ define void @v_shuffle_v2i32_v8i32__11_10(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10354,8 +10316,7 @@ define void @v_shuffle_v2i32_v8i32__1_12(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10370,8 +10331,8 @@ define void @v_shuffle_v2i32_v8i32__1_12(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10461,8 +10422,7 @@ define void @v_shuffle_v2i32_v8i32__3_12(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10477,8 +10437,8 @@ define void @v_shuffle_v2i32_v8i32__3_12(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v8
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10568,8 +10528,7 @@ define void @v_shuffle_v2i32_v8i32__5_12(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v10
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10584,8 +10543,8 @@ define void @v_shuffle_v2i32_v8i32__5_12(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v10
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10675,8 +10634,7 @@ define void @v_shuffle_v2i32_v8i32__7_12(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v12
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10691,8 +10649,8 @@ define void @v_shuffle_v2i32_v8i32__7_12(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v12
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10941,8 +10899,7 @@ define void @v_shuffle_v2i32_v8i32__13_12(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10954,8 +10911,7 @@ define void @v_shuffle_v2i32_v8i32__13_12(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -11892,8 +11848,7 @@ define void @v_shuffle_v2i32_v8i32__1_14(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -11908,8 +11863,8 @@ define void @v_shuffle_v2i32_v8i32__1_14(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v1, v8
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -11999,8 +11954,7 @@ define void @v_shuffle_v2i32_v8i32__3_14(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v10
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -12015,8 +11969,8 @@ define void @v_shuffle_v2i32_v8i32__3_14(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v10
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -12106,8 +12060,7 @@ define void @v_shuffle_v2i32_v8i32__5_14(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v12
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -12122,8 +12075,8 @@ define void @v_shuffle_v2i32_v8i32__5_14(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v12
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -12213,8 +12166,7 @@ define void @v_shuffle_v2i32_v8i32__7_14(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v14
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -12229,8 +12181,8 @@ define void @v_shuffle_v2i32_v8i32__7_14(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v14
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll
index 2cb50e0493ae0a..44818e7e197a2a 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll
@@ -171,15 +171,14 @@ define void @v_shuffle_v2p3_v2p3__3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -187,15 +186,15 @@ define void @v_shuffle_v2p3_v2p3__3_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -274,27 +273,24 @@ define void @v_shuffle_v2p3_v2p3__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2p3_v2p3__3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v2p3_v2p3__3_2:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -447,27 +443,24 @@ define void @v_shuffle_v2p3_v2p3__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2p3_v2p3__1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v2p3_v2p3__1_0:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll
index b92fa40a269996..e01ab1adc045b0 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll
@@ -632,10 +632,9 @@ define void @v_shuffle_v2p3_v3p3__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -645,10 +644,9 @@ define void @v_shuffle_v2p3_v3p3__1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -765,13 +763,12 @@ define void @v_shuffle_v2p3_v3p3__4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -786,9 +783,8 @@ define void @v_shuffle_v2p3_v3p3__4_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1480,10 +1476,9 @@ define void @v_shuffle_v2p3_v3p3__4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1493,10 +1488,9 @@ define void @v_shuffle_v2p3_v3p3__4_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll
index 8080c22d792198..79c65c554e8d78 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll
@@ -335,13 +335,12 @@ define void @v_shuffle_v2p3_v4p3__7_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -356,9 +355,8 @@ define void @v_shuffle_v2p3_v4p3__7_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -447,8 +445,7 @@ define void @v_shuffle_v2p3_v4p3__7_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -463,8 +460,8 @@ define void @v_shuffle_v2p3_v4p3__7_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -637,8 +634,7 @@ define void @v_shuffle_v2p3_v4p3__7_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -650,8 +646,7 @@ define void @v_shuffle_v2p3_v4p3__7_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -809,9 +804,8 @@ define void @v_shuffle_v2p3_v4p3__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -822,9 +816,8 @@ define void @v_shuffle_v2p3_v4p3__1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -984,13 +977,12 @@ define void @v_shuffle_v2p3_v4p3__5_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1005,9 +997,8 @@ define void @v_shuffle_v2p3_v4p3__5_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1607,8 +1598,7 @@ define void @v_shuffle_v2p3_v4p3__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1620,8 +1610,7 @@ define void @v_shuffle_v2p3_v4p3__3_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -1698,8 +1687,7 @@ define void @v_shuffle_v2p3_v4p3__5_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1714,8 +1702,8 @@ define void @v_shuffle_v2p3_v4p3__5_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -2331,9 +2319,8 @@ define void @v_shuffle_v2p3_v4p3__5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2344,9 +2331,8 @@ define void @v_shuffle_v2p3_v4p3__5_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2898,8 +2884,7 @@ define void @v_shuffle_v2p3_v4p3__1_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -2914,8 +2899,8 @@ define void @v_shuffle_v2p3_v4p3__1_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -3005,8 +2990,7 @@ define void @v_shuffle_v2p3_v4p3__3_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3021,8 +3005,8 @@ define void @v_shuffle_v2p3_v4p3__3_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll
index 02a5800ce1896a..9fee242fa94044 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll
@@ -659,13 +659,12 @@ define void @v_shuffle_v2p3_v8p3__15_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v9
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -680,9 +679,8 @@ define void @v_shuffle_v2p3_v8p3__15_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v9
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -771,8 +769,7 @@ define void @v_shuffle_v2p3_v8p3__15_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v11
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -787,8 +784,8 @@ define void @v_shuffle_v2p3_v8p3__15_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v11
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -878,8 +875,7 @@ define void @v_shuffle_v2p3_v8p3__15_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v13
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -894,8 +890,8 @@ define void @v_shuffle_v2p3_v8p3__15_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: v_mov_b32_e32 v0, v13
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -985,8 +981,7 @@ define void @v_shuffle_v2p3_v8p3__15_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v15
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1001,8 +996,8 @@ define void @v_shuffle_v2p3_v8p3__15_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v15
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -1351,8 +1346,7 @@ define void @v_shuffle_v2p3_v8p3__15_14(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1364,8 +1358,7 @@ define void @v_shuffle_v2p3_v8p3__15_14(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -1523,9 +1516,8 @@ define void @v_shuffle_v2p3_v8p3__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1536,9 +1528,8 @@ define void @v_shuffle_v2p3_v8p3__1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1870,13 +1861,12 @@ define void @v_shuffle_v2p3_v8p3__9_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1891,9 +1881,8 @@ define void @v_shuffle_v2p3_v8p3__9_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1978,13 +1967,12 @@ define void @v_shuffle_v2p3_v8p3__11_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1999,9 +1987,8 @@ define void @v_shuffle_v2p3_v8p3__11_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2086,13 +2073,12 @@ define void @v_shuffle_v2p3_v8p3__13_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v7
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2107,9 +2093,8 @@ define void @v_shuffle_v2p3_v8p3__13_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v7
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3089,8 +3074,7 @@ define void @v_shuffle_v2p3_v8p3__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3102,8 +3086,7 @@ define void @v_shuffle_v2p3_v8p3__3_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -3352,8 +3335,7 @@ define void @v_shuffle_v2p3_v8p3__9_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3368,8 +3350,8 @@ define void @v_shuffle_v2p3_v8p3__9_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -3459,8 +3441,7 @@ define void @v_shuffle_v2p3_v8p3__11_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3475,8 +3456,8 @@ define void @v_shuffle_v2p3_v8p3__11_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -3566,8 +3547,7 @@ define void @v_shuffle_v2p3_v8p3__13_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v9
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3582,8 +3562,8 @@ define void @v_shuffle_v2p3_v8p3__13_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v9
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -4650,8 +4630,7 @@ define void @v_shuffle_v2p3_v8p3__5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -4663,8 +4642,7 @@ define void @v_shuffle_v2p3_v8p3__5_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -4827,8 +4805,7 @@ define void @v_shuffle_v2p3_v8p3__9_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -4843,8 +4820,8 @@ define void @v_shuffle_v2p3_v8p3__9_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -4934,8 +4911,7 @@ define void @v_shuffle_v2p3_v8p3__11_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v9
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -4950,8 +4926,8 @@ define void @v_shuffle_v2p3_v8p3__11_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: v_mov_b32_e32 v0, v9
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -5041,8 +5017,7 @@ define void @v_shuffle_v2p3_v8p3__13_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v11
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -5057,8 +5032,8 @@ define void @v_shuffle_v2p3_v8p3__13_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: v_mov_b32_e32 v0, v11
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -6211,8 +6186,7 @@ define void @v_shuffle_v2p3_v8p3__7_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6224,8 +6198,7 @@ define void @v_shuffle_v2p3_v8p3__7_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -6302,8 +6275,7 @@ define void @v_shuffle_v2p3_v8p3__9_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v9
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6318,8 +6290,8 @@ define void @v_shuffle_v2p3_v8p3__9_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v9
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -6409,8 +6381,7 @@ define void @v_shuffle_v2p3_v8p3__11_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v11
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6425,8 +6396,8 @@ define void @v_shuffle_v2p3_v8p3__11_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v11
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -6516,8 +6487,7 @@ define void @v_shuffle_v2p3_v8p3__13_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v13
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6532,8 +6502,8 @@ define void @v_shuffle_v2p3_v8p3__13_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v13
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -7689,9 +7659,8 @@ define void @v_shuffle_v2p3_v8p3__9_8(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7702,9 +7671,8 @@ define void @v_shuffle_v2p3_v8p3__9_8(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -8816,8 +8784,7 @@ define void @v_shuffle_v2p3_v8p3__1_10(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -8832,8 +8799,8 @@ define void @v_shuffle_v2p3_v8p3__1_10(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -8923,8 +8890,7 @@ define void @v_shuffle_v2p3_v8p3__3_10(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -8939,8 +8905,8 @@ define void @v_shuffle_v2p3_v8p3__3_10(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -9030,8 +8996,7 @@ define void @v_shuffle_v2p3_v8p3__5_10(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -9046,8 +9011,8 @@ define void @v_shuffle_v2p3_v8p3__5_10(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v8
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -9137,8 +9102,7 @@ define void @v_shuffle_v2p3_v8p3__7_10(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v10
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -9153,8 +9117,8 @@ define void @v_shuffle_v2p3_v8p3__7_10(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v10
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -9315,8 +9279,7 @@ define void @v_shuffle_v2p3_v8p3__11_10(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -9328,8 +9291,7 @@ define void @v_shuffle_v2p3_v8p3__11_10(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10354,8 +10316,7 @@ define void @v_shuffle_v2p3_v8p3__1_12(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10370,8 +10331,8 @@ define void @v_shuffle_v2p3_v8p3__1_12(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10461,8 +10422,7 @@ define void @v_shuffle_v2p3_v8p3__3_12(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10477,8 +10437,8 @@ define void @v_shuffle_v2p3_v8p3__3_12(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v8
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10568,8 +10528,7 @@ define void @v_shuffle_v2p3_v8p3__5_12(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v10
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10584,8 +10543,8 @@ define void @v_shuffle_v2p3_v8p3__5_12(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v10
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10675,8 +10634,7 @@ define void @v_shuffle_v2p3_v8p3__7_12(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v12
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10691,8 +10649,8 @@ define void @v_shuffle_v2p3_v8p3__7_12(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v12
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -10941,8 +10899,7 @@ define void @v_shuffle_v2p3_v8p3__13_12(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -10954,8 +10911,7 @@ define void @v_shuffle_v2p3_v8p3__13_12(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:7]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v8, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -11892,8 +11848,7 @@ define void @v_shuffle_v2p3_v8p3__1_14(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:9]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -11908,8 +11863,8 @@ define void @v_shuffle_v2p3_v8p3__1_14(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:9]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v1, v8
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -11999,8 +11954,7 @@ define void @v_shuffle_v2p3_v8p3__3_14(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v10
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -12015,8 +11969,8 @@ define void @v_shuffle_v2p3_v8p3__3_14(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:11]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v10
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -12106,8 +12060,7 @@ define void @v_shuffle_v2p3_v8p3__5_14(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:13]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v12
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -12122,8 +12075,8 @@ define void @v_shuffle_v2p3_v8p3__5_14(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[6:13]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v12
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -12213,8 +12166,7 @@ define void @v_shuffle_v2p3_v8p3__7_14(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[8:15]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v14
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -12229,8 +12181,8 @@ define void @v_shuffle_v2p3_v8p3__7_14(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[8:15]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v14
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
index b87c969c5bbdf0..1851a34d0e5600 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
@@ -61,13 +61,10 @@ define void @v_shuffle_v4i64_v3i64__1_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4i64_v3i64__1_u_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v3
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -75,13 +72,10 @@ define void @v_shuffle_v4i64_v3i64__1_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_u_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -89,13 +83,10 @@ define void @v_shuffle_v4i64_v3i64__1_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4i64_v3i64__1_u_u_u:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -113,10 +104,9 @@ define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -127,10 +117,9 @@ define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -141,10 +130,9 @@ define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v4
-; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -168,13 +156,10 @@ define void @v_shuffle_v4i64_v3i64__4_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4i64_v3i64__4_u_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v3
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -182,13 +167,10 @@ define void @v_shuffle_v4i64_v3i64__4_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_u_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -196,13 +178,10 @@ define void @v_shuffle_v4i64_v3i64__4_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4i64_v3i64__4_u_u_u:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -221,10 +200,9 @@ define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -235,10 +213,9 @@ define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -249,10 +226,9 @@ define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v4
-; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -544,7 +520,6 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -558,7 +533,6 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -572,7 +546,6 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -875,9 +848,12 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16
+; GFX900-NEXT: s_nop 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -889,9 +865,12 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16
+; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -903,9 +882,12 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -2449,7 +2431,7 @@ define void @v_shuffle_v4i64_v3i64__u_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v4, v2
; GFX900-NEXT: v_mov_b32_e32 v5, v3
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2463,7 +2445,7 @@ define void @v_shuffle_v4i64_v3i64__u_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2477,7 +2459,7 @@ define void @v_shuffle_v4i64_v3i64__u_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v4, v2
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -2650,7 +2632,7 @@ define void @v_shuffle_v4i64_v3i64__3_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v4, v2
; GFX900-NEXT: v_mov_b32_e32 v5, v3
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2664,7 +2646,7 @@ define void @v_shuffle_v4i64_v3i64__3_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2678,7 +2660,7 @@ define void @v_shuffle_v4i64_v3i64__3_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v4, v2
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -4691,13 +4673,10 @@ define void @v_shuffle_v4i64_v3i64__1_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4i64_v3i64__1_3_3_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v3
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -4705,13 +4684,10 @@ define void @v_shuffle_v4i64_v3i64__1_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_3_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -4719,13 +4695,10 @@ define void @v_shuffle_v4i64_v3i64__1_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4i64_v3i64__1_3_3_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -4743,10 +4716,9 @@ define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4757,10 +4729,9 @@ define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4771,10 +4742,9 @@ define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v4
-; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -5609,7 +5579,7 @@ define void @v_shuffle_v4i64_v3i64__u_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v4, v2
; GFX900-NEXT: v_mov_b32_e32 v5, v3
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5623,7 +5593,7 @@ define void @v_shuffle_v4i64_v3i64__u_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5637,7 +5607,7 @@ define void @v_shuffle_v4i64_v3i64__u_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v4, v2
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -7046,8 +7016,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -7060,8 +7032,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7074,8 +7048,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -7408,13 +7384,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
+; GFX900-NEXT: s_nop 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -7422,13 +7399,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
+; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -7436,13 +7414,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_u_5:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -7807,22 +7786,44 @@ define void @s_shuffle_v4i64_v3i64__0_u_u_u() {
}
define void @s_shuffle_v4i64_v3i64__1_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
-; GFX9-NEXT: s_mov_b32 s12, s10
-; GFX9-NEXT: s_mov_b32 s13, s11
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b32 s9, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b32 s9, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s2
+; GFX940-NEXT: s_mov_b32 s9, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -7830,59 +7831,99 @@ define void @s_shuffle_v4i64_v3i64__1_u_u_u() {
}
define void @s_shuffle_v4i64_v3i64__2_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %vec0 = call <3 x i64> asm "; def $0", "=s"()
- %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
- call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
- ret void
-}
-
-define void @s_shuffle_v4i64_v3i64__3_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %vec0 = call <3 x i64> asm "; def $0", "=s"()
- %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
- call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
- ret void
-}
-
-define void @s_shuffle_v4i64_v3i64__4_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u:
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = call <3 x i64> asm "; def $0", "=s"()
+ %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
+ call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+ ret void
+}
+
+define void @s_shuffle_v4i64_v3i64__3_u_u_u() {
+; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_u_u_u:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
-; GFX9-NEXT: s_mov_b32 s12, s10
-; GFX9-NEXT: s_mov_b32 s13, s11
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
-; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s[8:15]
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = call <3 x i64> asm "; def $0", "=s"()
+ %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+ call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+ ret void
+}
+
+define void @s_shuffle_v4i64_v3i64__4_u_u_u() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b32 s9, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b32 s9, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s2
+; GFX940-NEXT: s_mov_b32 s9, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%vec1 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 4, i32 poison, i32 poison, i32 poison>
@@ -7891,22 +7932,40 @@ define void @s_shuffle_v4i64_v3i64__4_u_u_u() {
}
define void @s_shuffle_v4i64_v3i64__5_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%vec1 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 poison, i32 poison, i32 poison>
@@ -8150,22 +8209,50 @@ define void @s_shuffle_v4i64_v3i64__5_4_u_u() {
}
define void @s_shuffle_v4i64_v3i64__5_5_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s12
+; GFX900-NEXT: s_mov_b32 s9, s13
+; GFX900-NEXT: s_mov_b32 s10, s12
+; GFX900-NEXT: s_mov_b32 s11, s13
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s12
+; GFX90A-NEXT: s_mov_b32 s9, s13
+; GFX90A-NEXT: s_mov_b32 s10, s12
+; GFX90A-NEXT: s_mov_b32 s11, s13
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: s_mov_b32 s10, s4
+; GFX940-NEXT: s_mov_b32 s11, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%vec1 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 poison, i32 poison>
@@ -8491,8 +8578,6 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_u() {
; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s[8:15]
; GFX9-NEXT: ;;#ASMEND
@@ -8854,22 +8939,56 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_5() {
}
define void @s_shuffle_v4i64_v3i64__u_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s9
-; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_mov_b32 s15, s9
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s11, s5
+; GFX900-NEXT: s_mov_b32 s12, s4
+; GFX900-NEXT: s_mov_b32 s13, s5
+; GFX900-NEXT: s_mov_b32 s14, s4
+; GFX900-NEXT: s_mov_b32 s15, s5
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s11, s5
+; GFX90A-NEXT: s_mov_b32 s12, s4
+; GFX90A-NEXT: s_mov_b32 s13, s5
+; GFX90A-NEXT: s_mov_b32 s14, s4
+; GFX90A-NEXT: s_mov_b32 s15, s5
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s10, s0
+; GFX940-NEXT: s_mov_b32 s11, s1
+; GFX940-NEXT: s_mov_b32 s12, s0
+; GFX940-NEXT: s_mov_b32 s13, s1
+; GFX940-NEXT: s_mov_b32 s14, s0
+; GFX940-NEXT: s_mov_b32 s15, s1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -9022,22 +9141,56 @@ define void @s_shuffle_v4i64_v3i64__2_0_0_0() {
}
define void @s_shuffle_v4i64_v3i64__3_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s9
-; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_mov_b32 s15, s9
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s11, s5
+; GFX900-NEXT: s_mov_b32 s12, s4
+; GFX900-NEXT: s_mov_b32 s13, s5
+; GFX900-NEXT: s_mov_b32 s14, s4
+; GFX900-NEXT: s_mov_b32 s15, s5
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s11, s5
+; GFX90A-NEXT: s_mov_b32 s12, s4
+; GFX90A-NEXT: s_mov_b32 s13, s5
+; GFX90A-NEXT: s_mov_b32 s14, s4
+; GFX90A-NEXT: s_mov_b32 s15, s5
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s10, s0
+; GFX940-NEXT: s_mov_b32 s11, s1
+; GFX940-NEXT: s_mov_b32 s12, s0
+; GFX940-NEXT: s_mov_b32 s13, s1
+; GFX940-NEXT: s_mov_b32 s14, s0
+; GFX940-NEXT: s_mov_b32 s15, s1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -9964,8 +10117,6 @@ define void @s_shuffle_v4i64_v3i64__u_1_1_1() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s12, s10
; GFX9-NEXT: s_mov_b32 s13, s11
; GFX9-NEXT: s_mov_b32 s14, s10
@@ -10054,8 +10205,6 @@ define void @s_shuffle_v4i64_v3i64__3_1_1_1() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s12, s10
; GFX9-NEXT: s_mov_b32 s13, s11
; GFX9-NEXT: s_mov_b32 s14, s10
@@ -10968,8 +11117,6 @@ define void @s_shuffle_v4i64_v3i64__u_2_2_2() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
@@ -11058,8 +11205,6 @@ define void @s_shuffle_v4i64_v3i64__3_2_2_2() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
@@ -11992,22 +12137,44 @@ define void @s_shuffle_v4i64_v3i64__0_3_3_3() {
}
define void @s_shuffle_v4i64_v3i64__1_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
-; GFX9-NEXT: s_mov_b32 s12, s10
-; GFX9-NEXT: s_mov_b32 s13, s11
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b32 s9, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b32 s9, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s2
+; GFX940-NEXT: s_mov_b32 s9, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -12015,22 +12182,40 @@ define void @s_shuffle_v4i64_v3i64__1_3_3_3() {
}
define void @s_shuffle_v4i64_v3i64__2_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -12898,8 +13083,6 @@ define void @s_shuffle_v4i64_v3i64__u_4_4_4() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s12, s10
; GFX9-NEXT: s_mov_b32 s13, s11
; GFX9-NEXT: s_mov_b32 s14, s10
@@ -13871,8 +14054,6 @@ define void @s_shuffle_v4i64_v3i64__u_5_5_5() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
@@ -14144,8 +14325,6 @@ define void @s_shuffle_v4i64_v3i64__5_u_5_5() {
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_mov_b32 s8, s12
; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
; GFX9-NEXT: s_mov_b32 s15, s13
; GFX9-NEXT: ;;#ASMSTART
@@ -14447,22 +14626,56 @@ define void @s_shuffle_v4i64_v3i64__5_4_5_5() {
}
define void @s_shuffle_v4i64_v3i64__5_5_u_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s12
+; GFX900-NEXT: s_mov_b32 s9, s13
+; GFX900-NEXT: s_mov_b32 s10, s12
+; GFX900-NEXT: s_mov_b32 s11, s13
+; GFX900-NEXT: s_mov_b32 s14, s12
+; GFX900-NEXT: s_mov_b32 s15, s13
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s12
+; GFX90A-NEXT: s_mov_b32 s9, s13
+; GFX90A-NEXT: s_mov_b32 s10, s12
+; GFX90A-NEXT: s_mov_b32 s11, s13
+; GFX90A-NEXT: s_mov_b32 s14, s12
+; GFX90A-NEXT: s_mov_b32 s15, s13
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: s_mov_b32 s10, s4
+; GFX940-NEXT: s_mov_b32 s11, s5
+; GFX940-NEXT: s_mov_b32 s14, s4
+; GFX940-NEXT: s_mov_b32 s15, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=s"()
%vec1 = call <3 x i64> asm "; def $0", "=s"()
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 poison, i32 5>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
index 2b46616c87f0dd..7a509ffb8c1591 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
@@ -61,13 +61,10 @@ define void @v_shuffle_v4p0_v3p0__1_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4p0_v3p0__1_u_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v3
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -75,13 +72,10 @@ define void @v_shuffle_v4p0_v3p0__1_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_u_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -89,13 +83,10 @@ define void @v_shuffle_v4p0_v3p0__1_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4p0_v3p0__1_u_u_u:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -113,10 +104,9 @@ define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -127,10 +117,9 @@ define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -141,10 +130,9 @@ define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v4
-; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -168,13 +156,10 @@ define void @v_shuffle_v4p0_v3p0__4_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4p0_v3p0__4_u_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v3
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -182,13 +167,10 @@ define void @v_shuffle_v4p0_v3p0__4_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v3p0__4_u_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -196,13 +178,10 @@ define void @v_shuffle_v4p0_v3p0__4_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4p0_v3p0__4_u_u_u:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -221,10 +200,9 @@ define void @v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -235,10 +213,9 @@ define void @v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -249,10 +226,9 @@ define void @v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v4
-; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -544,7 +520,6 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -558,7 +533,6 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -572,7 +546,6 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -875,9 +848,12 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16
+; GFX900-NEXT: s_nop 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -889,9 +865,12 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16
+; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -903,9 +882,12 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -2449,7 +2431,7 @@ define void @v_shuffle_v4p0_v3p0__u_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v4, v2
; GFX900-NEXT: v_mov_b32_e32 v5, v3
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2463,7 +2445,7 @@ define void @v_shuffle_v4p0_v3p0__u_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2477,7 +2459,7 @@ define void @v_shuffle_v4p0_v3p0__u_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v4, v2
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -2650,7 +2632,7 @@ define void @v_shuffle_v4p0_v3p0__3_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v4, v2
; GFX900-NEXT: v_mov_b32_e32 v5, v3
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2664,7 +2646,7 @@ define void @v_shuffle_v4p0_v3p0__3_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2678,7 +2660,7 @@ define void @v_shuffle_v4p0_v3p0__3_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v4, v2
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -4691,13 +4673,10 @@ define void @v_shuffle_v4p0_v3p0__1_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4p0_v3p0__1_3_3_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v3
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -4705,13 +4684,10 @@ define void @v_shuffle_v4p0_v3p0__1_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_3_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -4719,13 +4695,10 @@ define void @v_shuffle_v4p0_v3p0__1_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4p0_v3p0__1_3_3_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -4743,10 +4716,9 @@ define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4757,10 +4729,9 @@ define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4771,10 +4742,9 @@ define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v6, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v4
-; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -5609,7 +5579,7 @@ define void @v_shuffle_v4p0_v3p0__u_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v4, v2
; GFX900-NEXT: v_mov_b32_e32 v5, v3
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -5623,7 +5593,7 @@ define void @v_shuffle_v4p0_v3p0__u_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5637,7 +5607,7 @@ define void @v_shuffle_v4p0_v3p0__u_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v4, v2
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -7046,8 +7016,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -7060,8 +7032,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7074,8 +7048,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -7408,13 +7384,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
+; GFX900-NEXT: s_nop 0
; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@@ -7422,13 +7399,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
+; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -7436,13 +7414,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_u_5:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_mov_b32_e32 v2, v4
; GFX940-NEXT: v_mov_b32_e32 v3, v5
-; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -7807,22 +7786,44 @@ define void @s_shuffle_v4p0_v3p0__0_u_u_u() {
}
define void @s_shuffle_v4p0_v3p0__1_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
-; GFX9-NEXT: s_mov_b32 s12, s10
-; GFX9-NEXT: s_mov_b32 s13, s11
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b32 s9, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b32 s9, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s2
+; GFX940-NEXT: s_mov_b32 s9, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -7830,59 +7831,99 @@ define void @s_shuffle_v4p0_v3p0__1_u_u_u() {
}
define void @s_shuffle_v4p0_v3p0__2_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %vec0 = call <3 x ptr> asm "; def $0", "=s"()
- %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
- call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
- ret void
-}
-
-define void @s_shuffle_v4p0_v3p0__3_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %vec0 = call <3 x ptr> asm "; def $0", "=s"()
- %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
- call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
- ret void
-}
-
-define void @s_shuffle_v4p0_v3p0__4_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u:
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__2_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+ %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
+ call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+ ret void
+}
+
+define void @s_shuffle_v4p0_v3p0__3_u_u_u() {
+; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_u_u_u:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
-; GFX9-NEXT: s_mov_b32 s12, s10
-; GFX9-NEXT: s_mov_b32 s13, s11
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
-; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s[8:15]
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+ %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+ call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+ ret void
+}
+
+define void @s_shuffle_v4p0_v3p0__4_u_u_u() {
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b32 s9, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b32 s9, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s2
+; GFX940-NEXT: s_mov_b32 s9, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%vec1 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 4, i32 poison, i32 poison, i32 poison>
@@ -7891,22 +7932,40 @@ define void @s_shuffle_v4p0_v3p0__4_u_u_u() {
}
define void @s_shuffle_v4p0_v3p0__5_u_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%vec1 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 poison, i32 poison, i32 poison>
@@ -8150,22 +8209,50 @@ define void @s_shuffle_v4p0_v3p0__5_4_u_u() {
}
define void @s_shuffle_v4p0_v3p0__5_5_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s12
+; GFX900-NEXT: s_mov_b32 s9, s13
+; GFX900-NEXT: s_mov_b32 s10, s12
+; GFX900-NEXT: s_mov_b32 s11, s13
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s12
+; GFX90A-NEXT: s_mov_b32 s9, s13
+; GFX90A-NEXT: s_mov_b32 s10, s12
+; GFX90A-NEXT: s_mov_b32 s11, s13
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: s_mov_b32 s10, s4
+; GFX940-NEXT: s_mov_b32 s11, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%vec1 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 poison, i32 poison>
@@ -8491,8 +8578,6 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_u() {
; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s[8:15]
; GFX9-NEXT: ;;#ASMEND
@@ -8854,22 +8939,56 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_5() {
}
define void @s_shuffle_v4p0_v3p0__u_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s9
-; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_mov_b32 s15, s9
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s11, s5
+; GFX900-NEXT: s_mov_b32 s12, s4
+; GFX900-NEXT: s_mov_b32 s13, s5
+; GFX900-NEXT: s_mov_b32 s14, s4
+; GFX900-NEXT: s_mov_b32 s15, s5
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s11, s5
+; GFX90A-NEXT: s_mov_b32 s12, s4
+; GFX90A-NEXT: s_mov_b32 s13, s5
+; GFX90A-NEXT: s_mov_b32 s14, s4
+; GFX90A-NEXT: s_mov_b32 s15, s5
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s10, s0
+; GFX940-NEXT: s_mov_b32 s11, s1
+; GFX940-NEXT: s_mov_b32 s12, s0
+; GFX940-NEXT: s_mov_b32 s13, s1
+; GFX940-NEXT: s_mov_b32 s14, s0
+; GFX940-NEXT: s_mov_b32 s15, s1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -9022,22 +9141,56 @@ define void @s_shuffle_v4p0_v3p0__2_0_0_0() {
}
define void @s_shuffle_v4p0_v3p0__3_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s9
-; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_mov_b32 s15, s9
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s11, s5
+; GFX900-NEXT: s_mov_b32 s12, s4
+; GFX900-NEXT: s_mov_b32 s13, s5
+; GFX900-NEXT: s_mov_b32 s14, s4
+; GFX900-NEXT: s_mov_b32 s15, s5
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s11, s5
+; GFX90A-NEXT: s_mov_b32 s12, s4
+; GFX90A-NEXT: s_mov_b32 s13, s5
+; GFX90A-NEXT: s_mov_b32 s14, s4
+; GFX90A-NEXT: s_mov_b32 s15, s5
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s10, s0
+; GFX940-NEXT: s_mov_b32 s11, s1
+; GFX940-NEXT: s_mov_b32 s12, s0
+; GFX940-NEXT: s_mov_b32 s13, s1
+; GFX940-NEXT: s_mov_b32 s14, s0
+; GFX940-NEXT: s_mov_b32 s15, s1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -9964,8 +10117,6 @@ define void @s_shuffle_v4p0_v3p0__u_1_1_1() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s12, s10
; GFX9-NEXT: s_mov_b32 s13, s11
; GFX9-NEXT: s_mov_b32 s14, s10
@@ -10054,8 +10205,6 @@ define void @s_shuffle_v4p0_v3p0__3_1_1_1() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s12, s10
; GFX9-NEXT: s_mov_b32 s13, s11
; GFX9-NEXT: s_mov_b32 s14, s10
@@ -10968,8 +11117,6 @@ define void @s_shuffle_v4p0_v3p0__u_2_2_2() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
@@ -11058,8 +11205,6 @@ define void @s_shuffle_v4p0_v3p0__3_2_2_2() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
@@ -11992,22 +12137,44 @@ define void @s_shuffle_v4p0_v3p0__0_3_3_3() {
}
define void @s_shuffle_v4p0_v3p0__1_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
-; GFX9-NEXT: s_mov_b32 s12, s10
-; GFX9-NEXT: s_mov_b32 s13, s11
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s6
+; GFX900-NEXT: s_mov_b32 s9, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s6
+; GFX90A-NEXT: s_mov_b32 s9, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s2
+; GFX940-NEXT: s_mov_b32 s9, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -12015,22 +12182,40 @@ define void @s_shuffle_v4p0_v3p0__1_3_3_3() {
}
define void @s_shuffle_v4p0_v3p0__2_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -12898,8 +13083,6 @@ define void @s_shuffle_v4p0_v3p0__u_4_4_4() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s12, s10
; GFX9-NEXT: s_mov_b32 s13, s11
; GFX9-NEXT: s_mov_b32 s14, s10
@@ -13871,8 +14054,6 @@ define void @s_shuffle_v4p0_v3p0__u_5_5_5() {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; def s[8:13]
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
@@ -14144,8 +14325,6 @@ define void @s_shuffle_v4p0_v3p0__5_u_5_5() {
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_mov_b32 s8, s12
; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: s_mov_b32 s14, s12
; GFX9-NEXT: s_mov_b32 s15, s13
; GFX9-NEXT: ;;#ASMSTART
@@ -14447,22 +14626,56 @@ define void @s_shuffle_v4p0_v3p0__5_4_5_5() {
}
define void @s_shuffle_v4p0_v3p0__5_5_u_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; def s[8:13]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_mov_b32 s8, s12
-; GFX9-NEXT: s_mov_b32 s9, s13
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: s_mov_b32 s14, s12
-; GFX9-NEXT: s_mov_b32 s15, s13
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s[8:15]
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s12
+; GFX900-NEXT: s_mov_b32 s9, s13
+; GFX900-NEXT: s_mov_b32 s10, s12
+; GFX900-NEXT: s_mov_b32 s11, s13
+; GFX900-NEXT: s_mov_b32 s14, s12
+; GFX900-NEXT: s_mov_b32 s15, s13
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s12
+; GFX90A-NEXT: s_mov_b32 s9, s13
+; GFX90A-NEXT: s_mov_b32 s10, s12
+; GFX90A-NEXT: s_mov_b32 s11, s13
+; GFX90A-NEXT: s_mov_b32 s14, s12
+; GFX90A-NEXT: s_mov_b32 s15, s13
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s8, s4
+; GFX940-NEXT: s_mov_b32 s9, s5
+; GFX940-NEXT: s_mov_b32 s10, s4
+; GFX940-NEXT: s_mov_b32 s11, s5
+; GFX940-NEXT: s_mov_b32 s14, s4
+; GFX940-NEXT: s_mov_b32 s15, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=s"()
%vec1 = call <3 x ptr> asm "; def $0", "=s"()
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 poison, i32 5>
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index e7ae9d831424cc..b85bd4c6346684 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -4942,78 +4942,78 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
;
; GFX940-LABEL: fma_shuffle_v2bf16:
; GFX940: ; %bb.0: ; %entry
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX940-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX940-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX940-NEXT: s_movk_i32 s2, 0x7fff
; GFX940-NEXT: s_mov_b32 s3, 0x7060302
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
-; GFX940-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9]
+; GFX940-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9]
+; GFX940-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
; GFX940-NEXT: global_load_dwordx2 v[4:5], v6, s[10:11]
; GFX940-NEXT: s_waitcnt vmcnt(2)
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX940-NEXT: s_waitcnt vmcnt(1)
-; GFX940-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX940-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX940-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
-; GFX940-NEXT: v_lshlrev_b32_e32 v12, 16, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX940-NEXT: v_fmac_f32_e32 v7, v8, v9
-; GFX940-NEXT: v_fmac_f32_e32 v0, v8, v4
-; GFX940-NEXT: v_fmac_f32_e32 v1, v12, v4
-; GFX940-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX940-NEXT: v_fmac_f32_e32 v11, v12, v9
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX940-NEXT: v_bfe_u32 v9, v0, 16, 1
-; GFX940-NEXT: v_add3_u32 v4, v4, v7, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX940-NEXT: v_or_b32_e32 v12, 0x400000, v0
-; GFX940-NEXT: v_bfe_u32 v13, v11, 16, 1
-; GFX940-NEXT: v_add3_u32 v9, v9, v0, s2
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; GFX940-NEXT: v_bfe_u32 v15, v1, 16, 1
-; GFX940-NEXT: v_add3_u32 v13, v13, v11, s2
-; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX940-NEXT: v_and_b32_e32 v12, 0xffff0000, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX940-NEXT: v_fmac_f32_e32 v8, v7, v9
+; GFX940-NEXT: v_fmac_f32_e32 v2, v7, v4
+; GFX940-NEXT: v_fmac_f32_e32 v3, v11, v4
+; GFX940-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX940-NEXT: v_fmac_f32_e32 v12, v11, v9
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX940-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX940-NEXT: v_add3_u32 v4, v4, v8, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX940-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX940-NEXT: v_add3_u32 v9, v9, v2, s2
+; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX940-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX940-NEXT: v_bfe_u32 v15, v3, 16, 1
+; GFX940-NEXT: v_add3_u32 v13, v13, v12, s2
+; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX940-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX940-NEXT: v_or_b32_e32 v16, 0x400000, v1
-; GFX940-NEXT: v_add3_u32 v15, v15, v1, s2
+; GFX940-NEXT: v_or_b32_e32 v16, 0x400000, v3
+; GFX940-NEXT: v_add3_u32 v15, v15, v3, s2
; GFX940-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX940-NEXT: v_cndmask_b32_e32 v1, v15, v16, vcc
+; GFX940-NEXT: v_cndmask_b32_e32 v3, v15, v16, vcc
; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX940-NEXT: v_fmac_f32_e32 v0, v2, v10
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX940-NEXT: v_fmac_f32_e32 v2, v0, v10
; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX940-NEXT: v_fmac_f32_e32 v4, v2, v5
-; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX940-NEXT: v_fmac_f32_e32 v1, v3, v10
-; GFX940-NEXT: v_fmac_f32_e32 v7, v3, v5
-; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX940-NEXT: v_fmac_f32_e32 v4, v0, v5
+; GFX940-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX940-NEXT: v_fmac_f32_e32 v3, v1, v10
+; GFX940-NEXT: v_fmac_f32_e32 v7, v1, v5
+; GFX940-NEXT: v_or_b32_e32 v1, 0x400000, v2
; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX940-NEXT: v_add3_u32 v2, v2, v0, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX940-NEXT: v_add3_u32 v0, v0, v2, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX940-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1
; GFX940-NEXT: v_add3_u32 v5, v5, v4, s2
-; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3
; GFX940-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX940-NEXT: v_add3_u32 v9, v9, v1, s2
+; GFX940-NEXT: v_add3_u32 v9, v9, v3, s2
; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX940-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX940-NEXT: v_add3_u32 v11, v11, v7, s2
; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
index 6633cec659d8e5..39af91b81110d0 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
@@ -70,8 +70,7 @@ define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, dou
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT: s_mov_b64 s[6:7], exec
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s8
-; CHECK-NEXT: v_mov_b32_e32 v1, s9
+; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
More information about the llvm-branch-commits
mailing list