[llvm] [AMDGPU] Legalize 64bit elements for BUILD_VECTOR on gfx942 (PR #145052)

Janek van Oirschot via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 31 09:15:53 PDT 2025


https://github.com/JanekvO updated https://github.com/llvm/llvm-project/pull/145052

>From f7cd027e7e2165d4864a84b7a3a5f81b9cb85fd1 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <Janek.vanOirschot at amd.com>
Date: Fri, 20 Jun 2025 03:24:29 -0700
Subject: [PATCH 1/2] [AMDGPU] Legalize 64bit elements for BUILD_VECTOR on
 gfx942

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |   22 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  166 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |    1 +
 llvm/test/CodeGen/AMDGPU/bf16-conversions.ll  |   16 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              |  882 +-
 .../AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll   |  239 +-
 .../AMDGPU/buffer-atomic-fadd.f32-rtn.ll      |  251 +-
 .../CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll  |  832 +-
 .../AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll |  239 +-
 .../AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll    |  251 +-
 llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll   |   38 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch.ll      |   48 +-
 llvm/test/CodeGen/AMDGPU/fmaximum3.ll         |    2 +-
 llvm/test/CodeGen/AMDGPU/fminimum3.ll         |    2 +-
 llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll  |   62 +-
 llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll        |   16 +-
 llvm/test/CodeGen/AMDGPU/literal64.ll         |    6 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll |  371 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll  |  175 +-
 llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll  |  590 +-
 llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll  |  590 +-
 .../CodeGen/AMDGPU/masked-load-vectortypes.ll |   75 +-
 llvm/test/CodeGen/AMDGPU/maximumnum.ll        | 1461 ++--
 llvm/test/CodeGen/AMDGPU/mfma-loop.ll         |   65 +-
 llvm/test/CodeGen/AMDGPU/minimumnum.ll        | 1461 ++--
 llvm/test/CodeGen/AMDGPU/mul.ll               |   27 +-
 .../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll     |   18 +-
 llvm/test/CodeGen/AMDGPU/packed-fp32.ll       |  710 +-
 llvm/test/CodeGen/AMDGPU/preload-kernargs.ll  |    8 +-
 .../test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll |   11 +-
 .../AMDGPU/shufflevector.v2i64.v2i64.ll       |  234 +-
 .../AMDGPU/shufflevector.v2i64.v3i64.ll       |  575 +-
 .../AMDGPU/shufflevector.v2i64.v4i64.ll       | 1273 ++-
 .../AMDGPU/shufflevector.v2i64.v8i64.ll       | 4339 ++++++----
 .../CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll |  234 +-
 .../CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll |  575 +-
 .../CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll | 1273 ++-
 .../AMDGPU/shufflevector.v3i64.v2i64.ll       |  663 +-
 .../AMDGPU/shufflevector.v3i64.v3i64.ll       | 1856 ++--
 .../AMDGPU/shufflevector.v3i64.v4i64.ll       | 4987 ++++++-----
 .../CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll |  663 +-
 .../CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll | 1856 ++--
 .../CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll | 4987 ++++++-----
 .../AMDGPU/shufflevector.v4i64.v2i64.ll       | 2213 +++--
 .../AMDGPU/shufflevector.v4i64.v3i64.ll       | 2400 +++---
 .../AMDGPU/shufflevector.v4i64.v4i64.ll       | 7502 +++++++++--------
 .../CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll | 2213 +++--
 .../CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll | 2400 +++---
 .../CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll | 7502 +++++++++--------
 llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll   |   25 +-
 .../test/CodeGen/AMDGPU/vector-reduce-fmax.ll |    7 +-
 .../test/CodeGen/AMDGPU/vector-reduce-fmin.ll |    5 +-
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll |   27 +-
 53 files changed, 33321 insertions(+), 23123 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 39b42002b907a..fc7c4da9d908c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -440,6 +440,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
   EVT EltVT = VT.getVectorElementType();
   SDLoc DL(N);
   SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
+  unsigned NumRegs = EltVT.getSizeInBits() / 32;
+  bool IsGCN = TM.getTargetTriple().isAMDGCN();
 
   if (NumVectorElts == 1) {
     CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
@@ -447,7 +449,6 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
     return;
   }
 
-  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
   if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
       CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
     uint64_t C = 0;
@@ -478,6 +479,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
 
   assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                   "supported yet");
+  assert((IsGCN || (!IsGCN && NumRegs == 1)) &&
+         "R600 does not support 64-bit reg_seq elements");
   // 32 = Max Num Vector Elements
   // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
   // 1 = Vector Register Class
@@ -492,8 +495,9 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
       IsRegSeq = false;
       break;
     }
-    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
-                         : R600RegisterInfo::getSubRegFromChannel(i);
+    unsigned Sub =
+        IsGCN ? SIRegisterInfo::getSubRegFromChannel(i * NumRegs, NumRegs)
+              : R600RegisterInfo::getSubRegFromChannel(i);
     RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
     RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
   }
@@ -503,8 +507,9 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
     MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                    DL, EltVT);
     for (unsigned i = NOps; i < NumVectorElts; ++i) {
-      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
-                           : R600RegisterInfo::getSubRegFromChannel(i);
+      unsigned Sub =
+          IsGCN ? SIRegisterInfo::getSubRegFromChannel(i * NumRegs, NumRegs)
+                : R600RegisterInfo::getSubRegFromChannel(i);
       RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
       RegSeqArgs[1 + (2 * i) + 1] =
           CurDAG->getTargetConstant(Sub, DL, MVT::i32);
@@ -672,9 +677,12 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
       break;
     }
 
-    assert(VT.getVectorElementType().bitsEq(MVT::i32));
+    EVT VET = VT.getVectorElementType();
+    assert(VET.bitsEq(MVT::i32) || VET.bitsEq(MVT::i64));
+    unsigned EltSize = VET.getSizeInBits();
     unsigned RegClassID =
-        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
+        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * EltSize)
+            ->getID();
     SelectBuildVector(N, RegClassID);
     return;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ad267576a84af..9dfc6b154d6f1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -359,9 +359,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   // Most operations are naturally 32-bit vector operations. We only support
   // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
   for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
-    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
-    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
-
+    if (!STI.hasMovB64()) {
+      setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+      AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
+    }
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
 
@@ -373,9 +374,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   }
 
   for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
-    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
-    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
-
+    if (!STI.hasMovB64()) {
+      setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+      AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
+    }
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
 
@@ -387,9 +389,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   }
 
   for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
-    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
-    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
-
+    if (!STI.hasMovB64()) {
+      setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+      AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
+    }
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
 
@@ -401,9 +404,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   }
 
   for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
-    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
-    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
-
+    if (!STI.hasMovB64()) {
+      setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+      AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
+    }
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
 
@@ -415,9 +419,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   }
 
   for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
-    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
-    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
-
+    if (!STI.hasMovB64()) {
+      setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+      AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
+    }
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
 
@@ -968,6 +973,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setTargetDAGCombine({ISD::ADD,
                        ISD::PTRADD,
+                       ISD::BITCAST,
+                       ISD::BUILD_VECTOR,
                        ISD::UADDO_CARRY,
                        ISD::SUB,
                        ISD::USUBO_CARRY,
@@ -14372,6 +14379,18 @@ bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
       EltSize, NumElem, Idx->isDivergent(), getSubtarget());
 }
 
+static unsigned getMappedVectorIndex(unsigned Idx, EVT From, EVT To) {
+  assert(From.isVector() && To.isVector() &&
+         "From and To types have to be vector types");
+  assert(From.getSizeInBits() == To.getSizeInBits() &&
+         "From and To vector types require the same size");
+
+  unsigned FromNumElts = From.getVectorNumElements();
+  unsigned ToNumElts = To.getVectorNumElements();
+
+  return (Idx * ToNumElts) / FromNumElts;
+}
+
 SDValue
 SITargetLowering::performExtractVectorEltCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
@@ -14434,6 +14453,27 @@ SITargetLowering::performExtractVectorEltCombine(SDNode *N,
     }
   }
 
+  // if PeekThoughBitcast(Vec)[MapIdx(CIdx)] == undef &&
+  //    VecEltSize < PeekThroughEltSize, then
+  // EXTRACT_VECTOR_ELT(bitcast(build_vector(..., undef, ...)), CIdx) => undef
+  auto *IndexC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  SDValue PeekThroughVec = peekThroughBitcasts(Vec);
+  EVT PeekThroughVecVT = PeekThroughVec.getValueType();
+  if (IndexC && PeekThroughVec.getOpcode() == ISD::BUILD_VECTOR &&
+      PeekThroughVecVT.isFixedLengthVector()) {
+    EVT PeekThroughVecEltVT = PeekThroughVecVT.getVectorElementType();
+    // Small elt size vectors to big elt size vectors are the cases covered for
+    // now (e.g., v4i32 bitcast(v2i64)) which may be conservative.
+    if (VecEltSize < PeekThroughVecEltVT.getSizeInBits()) {
+      unsigned IndexVal = IndexC->getZExtValue();
+      unsigned MappedIndexVal =
+          getMappedVectorIndex(IndexVal, VecVT, PeekThroughVecVT);
+      SDValue PeekThroughElt = PeekThroughVec.getOperand(MappedIndexVal);
+      if (PeekThroughElt.isUndef())
+        return DAG.getUNDEF(VecEltVT);
+    }
+  }
+
   // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
   if (shouldExpandVectorDynExt(N)) {
     SDLoc SL(N);
@@ -16017,6 +16057,92 @@ SDValue SITargetLowering::performSelectCombine(SDNode *N,
                          SelectLHS, SelectRHS);
 }
 
+SDValue
+SITargetLowering::performBuildVectorCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  const GCNSubtarget *ST = getSubtarget();
+  if (DCI.Level < AfterLegalizeDAG || !ST->hasMovB64())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+  unsigned SizeBits = VT.getSizeInBits();
+  unsigned EltSize = EltVT.getSizeInBits();
+
+  // Skip if:
+  //  - Value type isn't multiple of 64 bit (e.g., v3i32), or
+  //  - Element type has already been combined into 64b elements
+  if ((SizeBits % 64) != 0 || EltVT == MVT::i64 || EltVT == MVT::f64)
+    return SDValue();
+
+  // Construct the 64b values.
+  SmallVector<uint64_t, 8> ImmVals;
+  uint64_t ImmVal = 0;
+  uint64_t ImmSize = 0;
+  for (SDValue Opand : N->ops()) {
+    // Build_vector with constants only.
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Opand);
+    ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Opand);
+    BuildVectorSDNode *BV =
+        dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Opand));
+
+    if (!C && !FPC && !BV)
+      return SDValue();
+
+    uint64_t Val = 0;
+    if (BV) {
+      if (!BV->isConstant())
+        return SDValue();
+      bool IsLE = DAG.getDataLayout().isLittleEndian();
+      BitVector UndefElements;
+      SmallVector<APInt> RawBits;
+      if (!BV->getConstantRawBits(IsLE, EltSize, RawBits, UndefElements))
+        return SDValue();
+
+      assert(RawBits.size() == 1 &&
+             "BuildVector constant value retrieval expected 1 element");
+
+      if (UndefElements.any())
+        return SDValue();
+
+      Val = RawBits[0].getZExtValue();
+    } else
+      Val = C ? C->getZExtValue()
+              : FPC->getValueAPF().bitcastToAPInt().getZExtValue();
+    ImmVal |= Val << ImmSize;
+    ImmSize += EltSize;
+    if (ImmSize == 64) {
+      if (!isUInt<32>(ImmVal))
+        return SDValue();
+      ImmVals.push_back(ImmVal);
+      ImmVal = 0;
+      ImmSize = 0;
+    }
+  }
+
+  // Avoid emitting build_vector with 1 element and directly emit value.
+  if (ImmVals.size() == 1) {
+    SDValue Val = DAG.getConstant(ImmVals[0], SL, MVT::i64);
+    return DAG.getBitcast(VT, Val);
+  }
+
+  // Construct and return build_vector with 64b elements.
+  if (!ImmVals.empty()) {
+    SmallVector<SDValue, 8> VectorConsts;
+    for (uint64_t I : ImmVals)
+      VectorConsts.push_back(DAG.getConstant(I, SL, MVT::i64));
+    unsigned NewNumElts = SizeBits / 64;
+    LLVMContext &Ctx = *DAG.getContext();
+    EVT NewVT = EVT::getVectorVT(Ctx, MVT::i64, NewNumElts);
+    SDValue BV = DAG.getBuildVector(NewVT, SL, VectorConsts);
+    return DAG.getBitcast(VT, BV);
+  }
+  return SDValue();
+}
+
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   switch (N->getOpcode()) {
@@ -16110,6 +16236,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performFCanonicalizeCombine(N, DCI);
   case AMDGPUISD::RCP:
     return performRcpCombine(N, DCI);
+  case ISD::BUILD_VECTOR:
+    return performBuildVectorCombine(N, DCI);
   case ISD::FLDEXP:
   case AMDGPUISD::FRACT:
   case AMDGPUISD::RSQ:
@@ -16162,6 +16290,14 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performInsertVectorEltCombine(N, DCI);
   case ISD::FP_ROUND:
     return performFPRoundCombine(N, DCI);
+  case ISD::BITCAST: {
+    // Avoid undoing build_vector with 64b elements if subtarget supports 64b
+    // movs (i.e., avoid inf loop through combines).
+    const GCNSubtarget *ST = getSubtarget();
+    if (ST->hasMovB64())
+      return SDValue();
+    break;
+  }
   case ISD::LOAD: {
     if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
       return Widened;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index dedd9ae170774..af5ada1723855 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -233,6 +233,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
 
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index 752a87ac3cb73..ab17319d6449e 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -182,24 +182,24 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
 ; GFX-950:       ; %bb.0:
 ; GFX-950-NEXT:    v_cvt_f32_f64_e32 v6, v[2:3]
 ; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
-; GFX-950-NEXT:    v_and_b32_e32 v7, 1, v6
+; GFX-950-NEXT:    v_and_b32_e32 v8, 1, v6
 ; GFX-950-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]|
 ; GFX-950-NEXT:    v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5]
-; GFX-950-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v7
+; GFX-950-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v8
 ; GFX-950-NEXT:    v_cndmask_b32_e64 v2, -1, 1, s[2:3]
+; GFX-950-NEXT:    v_cvt_f32_f64_e32 v7, v[0:1]
 ; GFX-950-NEXT:    v_add_u32_e32 v2, v6, v2
 ; GFX-950-NEXT:    s_or_b64 vcc, s[0:1], vcc
-; GFX-950-NEXT:    v_cvt_f32_f64_e32 v5, v[0:1]
 ; GFX-950-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
-; GFX-950-NEXT:    v_and_b32_e32 v6, 1, v5
+; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v7
+; GFX-950-NEXT:    v_and_b32_e32 v5, 1, v7
 ; GFX-950-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
 ; GFX-950-NEXT:    v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3]
-; GFX-950-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v6
+; GFX-950-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v5
 ; GFX-950-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[2:3]
-; GFX-950-NEXT:    v_add_u32_e32 v0, v5, v0
+; GFX-950-NEXT:    v_add_u32_e32 v0, v7, v0
 ; GFX-950-NEXT:    s_or_b64 vcc, s[0:1], vcc
-; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v4
 ; GFX-950-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 52e697cae9fe5..32da8865754e6 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -6952,50 +6952,95 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
 ; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v32bf16_to_v32f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:48
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
-; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
-; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
-; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
-; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
-; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
-; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
-; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
-; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
-; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
-; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
-; GFX9-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: global_extload_v32bf16_to_v32f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX900-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:16
+; GFX900-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:32
+; GFX900-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:48
+; GFX900-NEXT:    s_waitcnt vmcnt(3)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX900-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX900-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT:    s_waitcnt vmcnt(2)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX900-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX900-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX900-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX900-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX900-NEXT:    s_waitcnt vmcnt(1)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX900-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX900-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
+; GFX900-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
+; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; GFX900-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
+; GFX900-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX900-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
+; GFX900-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
+; GFX900-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
+; GFX900-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
+; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
+; GFX900-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
+; GFX900-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
+; GFX900-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: global_extload_v32bf16_to_v32f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16
+; GFX950-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:32
+; GFX950-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:48
+; GFX950-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off
+; GFX950-NEXT:    s_waitcnt vmcnt(3)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX950-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
+; GFX950-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX950-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX950-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX950-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
+; GFX950-NEXT:    s_waitcnt vmcnt(2)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX950-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX950-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
+; GFX950-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
+; GFX950-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; GFX950-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
+; GFX950-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX950-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX950-NEXT:    s_waitcnt vmcnt(1)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
+; GFX950-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
+; GFX950-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
+; GFX950-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
+; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
+; GFX950-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
+; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
+; GFX950-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v0, 16, v32
+; GFX950-NEXT:    v_and_b32_e32 v1, 0xffff0000, v32
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
+; GFX950-NEXT:    v_and_b32_e32 v3, 0xffff0000, v33
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v34
+; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v34
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v35
+; GFX950-NEXT:    v_and_b32_e32 v7, 0xffff0000, v35
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v32bf16_to_v32f32:
 ; GFX10:       ; %bb.0:
@@ -7151,10 +7196,10 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX950-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX950-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v1
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v2bf16_to_v2f64:
@@ -7250,12 +7295,12 @@ define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX950-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v3bf16_to_v3f64:
@@ -7344,20 +7389,35 @@ define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
 ; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v4bf16_to_v4f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: global_extload_v4bf16_to_v4f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX900-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: global_extload_v4bf16_to_v4f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[6:7], v2
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v4bf16_to_v4f64:
 ; GFX10:       ; %bb.0:
@@ -7459,22 +7519,39 @@ define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
 ; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v5bf16_to_v5f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: global_extload_v5bf16_to_v5f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: global_extload_v5bf16_to_v5f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[6:7], v3
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v10
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v5bf16_to_v5f64:
 ; GFX10:       ; %bb.0:
@@ -7580,24 +7657,43 @@ define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
 ; GFX8-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v6bf16_to_v6f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: global_extload_v6bf16_to_v6f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX900-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: global_extload_v6bf16_to_v6f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[10:11], v3
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v12
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v6bf16_to_v6f64:
 ; GFX10:       ; %bb.0:
@@ -7719,28 +7815,51 @@ define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
 ; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v8bf16_to_v8f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v3
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: global_extload_v8bf16_to_v8f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX900-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX900-NEXT:    v_and_b32_e32 v14, 0xffff0000, v3
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: global_extload_v8bf16_to_v8f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT:    v_and_b32_e32 v16, 0xffff0000, v0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[14:15], v4
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[12:13], v3
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[10:11], v5
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v16
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v8bf16_to_v8f64:
 ; GFX10:       ; %bb.0:
@@ -7926,46 +8045,87 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
 ; GFX8-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v16bf16_to_v16f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
-; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
-; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
-; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
-; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
-; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v9
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v11
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[12:13], v14
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[26:27], v26
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: global_extload_v16bf16_to_v16f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
+; GFX900-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX900-NEXT:    s_waitcnt vmcnt(1)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX900-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX900-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX900-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX900-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
+; GFX900-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX900-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
+; GFX900-NEXT:    v_and_b32_e32 v22, 0xffff0000, v7
+; GFX900-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
+; GFX900-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX900-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
+; GFX900-NEXT:    v_and_b32_e32 v30, 0xffff0000, v9
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[6:7], v11
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[12:13], v14
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[26:27], v26
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
+; GFX900-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: global_extload_v16bf16_to_v16f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16
+; GFX950-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off
+; GFX950-NEXT:    s_waitcnt vmcnt(1)
+; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
+; GFX950-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_and_b32_e32 v12, 0xffff0000, v9
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT:    v_and_b32_e32 v32, 0xffff0000, v8
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT:    v_and_b32_e32 v33, 0xffff0000, v7
+; GFX950-NEXT:    v_lshlrev_b32_e32 v34, 16, v7
+; GFX950-NEXT:    v_and_b32_e32 v35, 0xffff0000, v6
+; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v6
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[30:31], v0
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[28:29], v1
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[26:27], v5
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[24:25], v4
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[22:23], v10
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[20:21], v3
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[18:19], v11
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[16:17], v2
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[14:15], v12
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[12:13], v9
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[10:11], v32
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[6:7], v33
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v34
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v35
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[0:1], v36
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v16bf16_to_v16f64:
 ; GFX10:       ; %bb.0:
@@ -9068,141 +9228,150 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX950-NEXT:    v_accvgpr_write_b32 a7, v47 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a8, v56 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a9, v57 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a10, v58 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a12, v60 ; Reload Reuse
-; GFX950-NEXT:    global_load_ushort v1, v[2:3], off offset:2
-; GFX950-NEXT:    global_load_ushort v4, v[2:3], off offset:12
-; GFX950-NEXT:    global_load_ushort v5, v[2:3], off offset:8
-; GFX950-NEXT:    global_load_ushort v6, v[2:3], off offset:4
-; GFX950-NEXT:    global_load_ushort v7, v[2:3], off
-; GFX950-NEXT:    global_load_ushort v8, v[2:3], off offset:6
-; GFX950-NEXT:    global_load_ushort v9, v[2:3], off offset:10
-; GFX950-NEXT:    global_load_ushort v10, v[2:3], off offset:14
-; GFX950-NEXT:    global_load_ushort v11, v[2:3], off offset:18
-; GFX950-NEXT:    global_load_ushort v12, v[2:3], off offset:28
-; GFX950-NEXT:    global_load_ushort v13, v[2:3], off offset:24
-; GFX950-NEXT:    global_load_ushort v14, v[2:3], off offset:20
-; GFX950-NEXT:    global_load_ushort v15, v[2:3], off offset:16
-; GFX950-NEXT:    global_load_ushort v16, v[2:3], off offset:22
-; GFX950-NEXT:    global_load_ushort v17, v[2:3], off offset:26
-; GFX950-NEXT:    global_load_ushort v18, v[2:3], off offset:30
-; GFX950-NEXT:    global_load_ushort v19, v[2:3], off offset:34
-; GFX950-NEXT:    global_load_ushort v20, v[2:3], off offset:44
-; GFX950-NEXT:    global_load_ushort v21, v[2:3], off offset:40
-; GFX950-NEXT:    global_load_ushort v22, v[2:3], off offset:36
-; GFX950-NEXT:    global_load_ushort v23, v[2:3], off offset:32
-; GFX950-NEXT:    global_load_ushort v24, v[2:3], off offset:38
-; GFX950-NEXT:    global_load_ushort v25, v[2:3], off offset:42
-; GFX950-NEXT:    global_load_ushort v26, v[2:3], off offset:46
-; GFX950-NEXT:    global_load_ushort v42, v[2:3], off offset:50
-; GFX950-NEXT:    global_load_ushort v43, v[2:3], off offset:62
-; GFX950-NEXT:    global_load_ushort v46, v[2:3], off offset:60
-; GFX950-NEXT:    global_load_ushort v47, v[2:3], off offset:56
-; GFX950-NEXT:    global_load_ushort v60, v[2:3], off offset:52
-; GFX950-NEXT:    global_load_ushort v56, v[2:3], off offset:48
-; GFX950-NEXT:    global_load_ushort v57, v[2:3], off offset:54
-; GFX950-NEXT:    global_load_ushort v58, v[2:3], off offset:58
+; GFX950-NEXT:    v_accvgpr_write_b32 a13, v61 ; Reload Reuse
+; GFX950-NEXT:    global_load_ushort v1, v[2:3], off offset:30
+; GFX950-NEXT:    global_load_ushort v4, v[2:3], off offset:28
+; GFX950-NEXT:    global_load_ushort v5, v[2:3], off offset:26
+; GFX950-NEXT:    global_load_ushort v6, v[2:3], off offset:24
+; GFX950-NEXT:    global_load_ushort v7, v[2:3], off offset:22
+; GFX950-NEXT:    global_load_ushort v8, v[2:3], off offset:20
+; GFX950-NEXT:    global_load_ushort v9, v[2:3], off offset:18
+; GFX950-NEXT:    global_load_ushort v10, v[2:3], off offset:16
+; GFX950-NEXT:    global_load_ushort v11, v[2:3], off offset:14
+; GFX950-NEXT:    global_load_ushort v12, v[2:3], off offset:12
+; GFX950-NEXT:    global_load_ushort v13, v[2:3], off offset:10
+; GFX950-NEXT:    global_load_ushort v14, v[2:3], off offset:8
+; GFX950-NEXT:    global_load_ushort v15, v[2:3], off offset:6
+; GFX950-NEXT:    global_load_ushort v16, v[2:3], off offset:4
+; GFX950-NEXT:    global_load_ushort v17, v[2:3], off offset:2
+; GFX950-NEXT:    global_load_ushort v18, v[2:3], off
+; GFX950-NEXT:    global_load_ushort v19, v[2:3], off offset:62
+; GFX950-NEXT:    global_load_ushort v20, v[2:3], off offset:60
+; GFX950-NEXT:    global_load_ushort v21, v[2:3], off offset:58
+; GFX950-NEXT:    global_load_ushort v22, v[2:3], off offset:56
+; GFX950-NEXT:    global_load_ushort v23, v[2:3], off offset:54
+; GFX950-NEXT:    global_load_ushort v24, v[2:3], off offset:52
+; GFX950-NEXT:    global_load_ushort v25, v[2:3], off offset:50
+; GFX950-NEXT:    global_load_ushort v26, v[2:3], off offset:48
+; GFX950-NEXT:    global_load_ushort v42, v[2:3], off offset:46
+; GFX950-NEXT:    global_load_ushort v43, v[2:3], off offset:44
+; GFX950-NEXT:    global_load_ushort v46, v[2:3], off offset:42
+; GFX950-NEXT:    global_load_ushort v47, v[2:3], off offset:40
+; GFX950-NEXT:    global_load_ushort v56, v[2:3], off offset:38
+; GFX950-NEXT:    global_load_ushort v57, v[2:3], off offset:36
+; GFX950-NEXT:    global_load_ushort v60, v[2:3], off offset:34
+; GFX950-NEXT:    global_load_ushort v61, v[2:3], off offset:32
 ; GFX950-NEXT:    v_accvgpr_write_b32 a4, v44 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a0, v40 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a1, v41 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a5, v45 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a10, v58 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a11, v59 ; Reload Reuse
 ; GFX950-NEXT:    s_waitcnt vmcnt(31)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX950-NEXT:    s_waitcnt vmcnt(30)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v30, 16, v4
+; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX950-NEXT:    s_waitcnt vmcnt(29)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v28, 16, v5
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
+; GFX950-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX950-NEXT:    s_waitcnt vmcnt(28)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX950-NEXT:    s_waitcnt vmcnt(27)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
+; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
 ; GFX950-NEXT:    s_waitcnt vmcnt(26)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
 ; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v27, 16, v9
+; GFX950-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[32:33], v1
 ; GFX950-NEXT:    s_waitcnt vmcnt(24)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
+; GFX950-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
 ; GFX950-NEXT:    s_waitcnt vmcnt(23)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v31, 16, v11
+; GFX950-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
 ; GFX950-NEXT:    s_waitcnt vmcnt(22)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
+; GFX950-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
 ; GFX950-NEXT:    s_waitcnt vmcnt(21)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX950-NEXT:    v_lshlrev_b32_e32 v11, 16, v13
 ; GFX950-NEXT:    s_waitcnt vmcnt(20)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
-; GFX950-NEXT:    s_waitcnt vmcnt(19)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
 ; GFX950-NEXT:    s_waitcnt vmcnt(18)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v33, 16, v16
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[12:13], v27
+; GFX950-NEXT:    v_lshlrev_b32_e32 v36, 16, v16
+; GFX950-NEXT:    s_waitcnt vmcnt(17)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v17
 ; GFX950-NEXT:    s_waitcnt vmcnt(16)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v37, 16, v18
+; GFX950-NEXT:    v_lshlrev_b32_e32 v38, 16, v18
 ; GFX950-NEXT:    s_waitcnt vmcnt(15)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v39, 16, v19
+; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v15
 ; GFX950-NEXT:    s_waitcnt vmcnt(14)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v44, 16, v20
+; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
 ; GFX950-NEXT:    s_waitcnt vmcnt(13)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v21
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[14:15], v30
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[20:21], v31
+; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v21
+; GFX950-NEXT:    s_waitcnt vmcnt(12)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
+; GFX950-NEXT:    s_waitcnt vmcnt(11)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v23
 ; GFX950-NEXT:    s_waitcnt vmcnt(10)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v49, 16, v24
-; GFX950-NEXT:    s_waitcnt vmcnt(9)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v53, 16, v25
+; GFX950-NEXT:    v_lshlrev_b32_e32 v40, 16, v24
 ; GFX950-NEXT:    s_waitcnt vmcnt(8)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v26
+; GFX950-NEXT:    v_lshlrev_b32_e32 v44, 16, v26
 ; GFX950-NEXT:    s_waitcnt vmcnt(7)
 ; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v42
-; GFX950-NEXT:    s_waitcnt vmcnt(6)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v42, 16, v43
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[18:19], v32
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[24:25], v33
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[26:27], v36
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[32:33], v37
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[30:31], v38
+; GFX950-NEXT:    v_lshlrev_b32_e32 v41, 16, v25
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[30:31], v2
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[28:29], v3
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[26:27], v4
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[24:25], v5
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[22:23], v6
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[20:21], v7
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[6:7], v36
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v37
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v38
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[36:37], v39
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[38:39], v44
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[44:45], v42
-; GFX950-NEXT:    s_waitcnt vmcnt(5)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v42, 16, v46
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[42:43], v42
-; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v46, 16, v58
-; GFX950-NEXT:    scratch_store_dwordx4 v0, v[42:45], off offset:240
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[58:59], v46
-; GFX950-NEXT:    v_lshlrev_b32_e32 v46, 16, v47
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[44:45], v1
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v56
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[42:43], v1
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v57
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[56:57], v46
-; GFX950-NEXT:    v_lshlrev_b32_e32 v35, 16, v17
-; GFX950-NEXT:    v_lshlrev_b32_e32 v48, 16, v23
-; GFX950-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
-; GFX950-NEXT:    scratch_store_dwordx4 v0, v[56:59], off offset:224
-; GFX950-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[10:11], v28
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[58:59], v1
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v60
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[16:17], v29
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[22:23], v34
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[28:29], v35
+; GFX950-NEXT:    s_waitcnt vmcnt(6)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v43
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[18:19], v8
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[16:17], v9
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[14:15], v10
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[12:13], v11
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[10:11], v34
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[8:9], v35
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[34:35], v48
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[42:43], v1
+; GFX950-NEXT:    s_waitcnt vmcnt(5)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v46
+; GFX950-NEXT:    scratch_store_dwordx4 v0, v[34:37], off offset:240
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[50:51], v49
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[48:49], v52
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[36:37], v1
+; GFX950-NEXT:    s_waitcnt vmcnt(5)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v47
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[34:35], v1
+; GFX950-NEXT:    s_waitcnt vmcnt(4)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v56
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[58:59], v1
+; GFX950-NEXT:    s_waitcnt vmcnt(3)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v57
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[56:57], v1
+; GFX950-NEXT:    s_waitcnt vmcnt(2)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v60
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[54:55], v53
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[52:53], v40
 ; GFX950-NEXT:    v_cvt_f64_f32_e32 v[40:41], v41
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[56:57], v1
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[8:9], v7
-; GFX950-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; GFX950-NEXT:    scratch_store_dwordx4 v0, v[56:59], off offset:208
-; GFX950-NEXT:    scratch_store_dwordx4 v0, v[42:45], off offset:192
-; GFX950-NEXT:    scratch_store_dwordx4 v0, v[38:41], off offset:176
-; GFX950-NEXT:    scratch_store_dwordx4 v0, v[52:55], off offset:160
-; GFX950-NEXT:    scratch_store_dwordx4 v0, v[48:51], off offset:144
-; GFX950-NEXT:    scratch_store_dwordx4 v0, v[34:37], off offset:128
+; GFX950-NEXT:    scratch_store_dwordx4 v0, v[48:51], off offset:224
+; GFX950-NEXT:    v_accvgpr_read_b32 v60, a12 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v47, a7 ; Reload Reuse
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[50:51], v1
+; GFX950-NEXT:    s_waitcnt vmcnt(2)
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v61
+; GFX950-NEXT:    v_cvt_f64_f32_e32 v[48:49], v1
+; GFX950-NEXT:    scratch_store_dwordx4 v0, v[52:55], off offset:208
+; GFX950-NEXT:    scratch_store_dwordx4 v0, v[38:41], off offset:192
+; GFX950-NEXT:    scratch_store_dwordx4 v0, v[42:45], off offset:176
+; GFX950-NEXT:    scratch_store_dwordx4 v0, v[34:37], off offset:160
+; GFX950-NEXT:    scratch_store_dwordx4 v0, v[56:59], off offset:144
+; GFX950-NEXT:    scratch_store_dwordx4 v0, v[48:51], off offset:128
 ; GFX950-NEXT:    scratch_store_dwordx4 v0, v[30:33], off offset:112
 ; GFX950-NEXT:    scratch_store_dwordx4 v0, v[26:29], off offset:96
 ; GFX950-NEXT:    scratch_store_dwordx4 v0, v[22:25], off offset:80
@@ -9211,12 +9380,11 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
 ; GFX950-NEXT:    scratch_store_dwordx4 v0, v[10:13], off offset:32
 ; GFX950-NEXT:    scratch_store_dwordx4 v0, v[6:9], off offset:16
 ; GFX950-NEXT:    scratch_store_dwordx4 v0, v[2:5], off
-; GFX950-NEXT:    v_accvgpr_read_b32 v60, a12 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v61, a13 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_read_b32 v59, a11 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_read_b32 v58, a10 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_read_b32 v57, a9 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_read_b32 v56, a8 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v47, a7 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_read_b32 v46, a6 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_read_b32 v45, a5 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_read_b32 v44, a4 ; Reload Reuse
@@ -34060,34 +34228,34 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
 ; GFX950-LABEL: v_fptosi_v2bf16_to_v2i64:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX950-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
 ; GFX950-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX950-NEXT:    s_mov_b32 s0, 0x2f800000
 ; GFX950-NEXT:    v_mul_f32_e64 v2, |v1|, s0
-; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX950-NEXT:    v_floor_f32_e32 v2, v2
 ; GFX950-NEXT:    s_mov_b32 s1, 0xcf800000
-; GFX950-NEXT:    v_trunc_f32_e32 v4, v0
 ; GFX950-NEXT:    v_fma_f32 v3, v2, s1, |v1|
-; GFX950-NEXT:    v_mul_f32_e64 v0, |v4|, s0
-; GFX950-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX950-NEXT:    v_floor_f32_e32 v0, v0
 ; GFX950-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX950-NEXT:    v_fma_f32 v5, v0, s1, |v4|
-; GFX950-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX950-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX950-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
-; GFX950-NEXT:    v_cvt_u32_f32_e32 v6, v0
-; GFX950-NEXT:    v_xor_b32_e32 v3, v3, v1
-; GFX950-NEXT:    v_xor_b32_e32 v2, v2, v1
-; GFX950-NEXT:    v_sub_co_u32_e32 v0, vcc, v3, v1
-; GFX950-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX950-NEXT:    v_xor_b32_e32 v2, v5, v3
-; GFX950-NEXT:    v_xor_b32_e32 v4, v6, v3
-; GFX950-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX950-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX950-NEXT:    v_xor_b32_e32 v4, v2, v1
+; GFX950-NEXT:    v_xor_b32_e32 v2, v3, v1
+; GFX950-NEXT:    v_mul_f32_e64 v3, |v0|, s0
+; GFX950-NEXT:    v_floor_f32_e32 v3, v3
+; GFX950-NEXT:    v_fma_f32 v5, v3, s1, |v0|
+; GFX950-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX950-NEXT:    v_cvt_u32_f32_e32 v6, v3
+; GFX950-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX950-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v1, vcc
+; GFX950-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX950-NEXT:    v_xor_b32_e32 v0, v5, v1
+; GFX950-NEXT:    v_xor_b32_e32 v4, v6, v1
+; GFX950-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fptosi_v2bf16_to_v2i64:
@@ -34343,48 +34511,47 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
 ; GFX950-LABEL: v_fptosi_v3bf16_to_v3i64:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX950-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX950-NEXT:    s_mov_b32 s0, 0x2f800000
-; GFX950-NEXT:    v_mul_f32_e64 v3, |v2|, s0
-; GFX950-NEXT:    v_floor_f32_e32 v3, v3
+; GFX950-NEXT:    v_mul_f32_e64 v2, |v1|, s0
+; GFX950-NEXT:    v_floor_f32_e32 v2, v2
 ; GFX950-NEXT:    s_mov_b32 s1, 0xcf800000
-; GFX950-NEXT:    v_fma_f32 v4, v3, s1, |v2|
-; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX950-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX950-NEXT:    v_trunc_f32_e32 v5, v0
+; GFX950-NEXT:    v_fma_f32 v3, v2, s1, |v1|
+; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX950-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX950-NEXT:    v_mul_f32_e64 v0, |v5|, s0
-; GFX950-NEXT:    v_floor_f32_e32 v0, v0
-; GFX950-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
-; GFX950-NEXT:    v_fma_f32 v6, v0, s1, |v5|
-; GFX950-NEXT:    v_xor_b32_e32 v4, v4, v2
-; GFX950-NEXT:    v_cvt_u32_f32_e32 v7, v6
-; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX950-NEXT:    v_xor_b32_e32 v3, v3, v2
-; GFX950-NEXT:    v_cvt_u32_f32_e32 v8, v0
-; GFX950-NEXT:    v_sub_co_u32_e32 v0, vcc, v4, v2
-; GFX950-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_subb_co_u32_e32 v6, vcc, v3, v2, vcc
-; GFX950-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
-; GFX950-NEXT:    v_mul_f32_e64 v5, |v1|, s0
-; GFX950-NEXT:    v_floor_f32_e32 v5, v5
-; GFX950-NEXT:    v_xor_b32_e32 v2, v7, v3
-; GFX950-NEXT:    v_fma_f32 v7, v5, s1, |v1|
-; GFX950-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GFX950-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX950-NEXT:    v_xor_b32_e32 v4, v8, v3
-; GFX950-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX950-NEXT:    v_trunc_f32_e32 v6, v4
+; GFX950-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX950-NEXT:    v_mul_f32_e64 v4, |v6|, s0
+; GFX950-NEXT:    v_floor_f32_e32 v4, v4
 ; GFX950-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX950-NEXT:    v_fma_f32 v5, v4, s1, |v6|
+; GFX950-NEXT:    v_xor_b32_e32 v3, v3, v1
+; GFX950-NEXT:    v_cvt_u32_f32_e32 v7, v5
+; GFX950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT:    v_xor_b32_e32 v2, v2, v1
+; GFX950-NEXT:    v_cvt_u32_f32_e32 v8, v4
+; GFX950-NEXT:    v_sub_co_u32_e32 v4, vcc, v3, v1
+; GFX950-NEXT:    v_trunc_f32_e32 v0, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
-; GFX950-NEXT:    v_xor_b32_e32 v4, v7, v1
-; GFX950-NEXT:    v_xor_b32_e32 v5, v5, v1
-; GFX950-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v1
+; GFX950-NEXT:    v_subb_co_u32_e32 v5, vcc, v2, v1, vcc
+; GFX950-NEXT:    v_ashrrev_i32_e32 v1, 31, v6
+; GFX950-NEXT:    v_mul_f32_e64 v6, |v0|, s0
+; GFX950-NEXT:    v_floor_f32_e32 v6, v6
+; GFX950-NEXT:    v_xor_b32_e32 v2, v7, v1
+; GFX950-NEXT:    v_fma_f32 v7, v6, s1, |v0|
+; GFX950-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX950-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX950-NEXT:    v_xor_b32_e32 v3, v8, v1
+; GFX950-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
+; GFX950-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX950-NEXT:    v_xor_b32_e32 v0, v7, v1
+; GFX950-NEXT:    v_xor_b32_e32 v6, v6, v1
+; GFX950-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v1, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v1, v6
+; GFX950-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fptosi_v3bf16_to_v3i64:
@@ -34713,60 +34880,59 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
 ; GFX950-LABEL: v_fptosi_v4bf16_to_v4i64:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX950-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX950-NEXT:    s_mov_b32 s0, 0x2f800000
 ; GFX950-NEXT:    v_mul_f32_e64 v3, |v2|, s0
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX950-NEXT:    v_floor_f32_e32 v3, v3
 ; GFX950-NEXT:    s_mov_b32 s1, 0xcf800000
+; GFX950-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX950-NEXT:    v_fma_f32 v4, v3, s1, |v2|
-; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX950-NEXT:    v_mul_f32_e64 v5, |v1|, s0
 ; GFX950-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX950-NEXT:    v_trunc_f32_e32 v5, v0
+; GFX950-NEXT:    v_floor_f32_e32 v5, v5
 ; GFX950-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX950-NEXT:    v_mul_f32_e64 v0, |v5|, s0
-; GFX950-NEXT:    v_floor_f32_e32 v0, v0
+; GFX950-NEXT:    v_fma_f32 v6, v5, s1, |v1|
+; GFX950-NEXT:    v_cvt_u32_f32_e32 v8, v6
 ; GFX950-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
-; GFX950-NEXT:    v_fma_f32 v6, v0, s1, |v5|
 ; GFX950-NEXT:    v_xor_b32_e32 v4, v4, v2
-; GFX950-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX950-NEXT:    v_xor_b32_e32 v3, v3, v2
-; GFX950-NEXT:    v_cvt_u32_f32_e32 v7, v0
-; GFX950-NEXT:    v_sub_co_u32_e32 v0, vcc, v4, v2
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_subb_co_u32_e32 v8, vcc, v3, v2, vcc
-; GFX950-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
-; GFX950-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX950-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX950-NEXT:    v_xor_b32_e32 v2, v6, v3
-; GFX950-NEXT:    v_mul_f32_e64 v6, |v5|, s0
-; GFX950-NEXT:    v_floor_f32_e32 v6, v6
-; GFX950-NEXT:    v_xor_b32_e32 v4, v7, v3
-; GFX950-NEXT:    v_fma_f32 v7, v6, s1, |v5|
-; GFX950-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GFX950-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX950-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
-; GFX950-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
-; GFX950-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX950-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
-; GFX950-NEXT:    v_xor_b32_e32 v4, v7, v5
-; GFX950-NEXT:    v_mul_f32_e64 v7, |v1|, s0
-; GFX950-NEXT:    v_floor_f32_e32 v7, v7
-; GFX950-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX950-NEXT:    v_fma_f32 v9, v7, s1, |v1|
-; GFX950-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; GFX950-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GFX950-NEXT:    v_xor_b32_e32 v6, v6, v5
-; GFX950-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v5
+; GFX950-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX950-NEXT:    v_sub_co_u32_e32 v6, vcc, v4, v2
 ; GFX950-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX950-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT:    v_subb_co_u32_e32 v7, vcc, v3, v2, vcc
+; GFX950-NEXT:    v_xor_b32_e32 v3, v8, v1
+; GFX950-NEXT:    v_trunc_f32_e32 v8, v4
+; GFX950-NEXT:    v_mul_f32_e64 v4, |v8|, s0
+; GFX950-NEXT:    v_floor_f32_e32 v4, v4
+; GFX950-NEXT:    v_xor_b32_e32 v2, v5, v1
+; GFX950-NEXT:    v_fma_f32 v5, v4, s1, |v8|
+; GFX950-NEXT:    v_cvt_u32_f32_e32 v9, v5
+; GFX950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT:    v_cvt_u32_f32_e32 v10, v4
+; GFX950-NEXT:    v_sub_co_u32_e32 v4, vcc, v3, v1
+; GFX950-NEXT:    v_trunc_f32_e32 v0, v0
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
-; GFX950-NEXT:    v_xor_b32_e32 v6, v9, v1
-; GFX950-NEXT:    v_xor_b32_e32 v7, v7, v1
-; GFX950-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v1
+; GFX950-NEXT:    v_subb_co_u32_e32 v5, vcc, v2, v1, vcc
+; GFX950-NEXT:    v_ashrrev_i32_e32 v1, 31, v8
+; GFX950-NEXT:    v_mul_f32_e64 v8, |v0|, s0
+; GFX950-NEXT:    v_floor_f32_e32 v8, v8
+; GFX950-NEXT:    v_xor_b32_e32 v2, v9, v1
+; GFX950-NEXT:    v_fma_f32 v9, v8, s1, |v0|
+; GFX950-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; GFX950-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GFX950-NEXT:    v_xor_b32_e32 v3, v10, v1
+; GFX950-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
+; GFX950-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX950-NEXT:    v_xor_b32_e32 v0, v9, v1
+; GFX950-NEXT:    v_xor_b32_e32 v8, v8, v1
+; GFX950-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v1
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v1, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v1, v8
+; GFX950-NEXT:    v_subb_co_u32_e32 v1, vcc, v8, v1, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i64:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll
index b08e9c439a9fe..74ae44d5210e3 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
 
 define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
@@ -167,25 +167,41 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offset_no_rtn(float %val, ptr
   ; GFX908_GFX11-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908_GFX11-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
   ret void
 }
@@ -212,26 +228,43 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offen_no_rtn(float %val, ptr a
   ; GFX908_GFX11-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908_GFX11-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
 }
@@ -258,26 +291,43 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_idxen_no_rtn(float %val, ptr a
   ; GFX908_GFX11-NEXT:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908_GFX11-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
   ret void
 }
@@ -306,28 +356,47 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_bothen_no_rtn(float %val, ptr
   ; GFX908_GFX11-NEXT:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908_GFX11-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll
index b80aa9324e616..de41cfabc5fd4 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s
 
 define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
@@ -153,26 +153,43 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32>
 }
 
 define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   ;
   ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn
   ; GFX11: bb.0 (%ir-block.0):
@@ -199,27 +216,45 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr ad
 }
 
 define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   ;
   ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn
   ; GFX11: bb.0 (%ir-block.0):
@@ -247,27 +282,45 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr add
 }
 
 define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   ;
   ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn
   ; GFX11: bb.0 (%ir-block.0):
@@ -295,29 +348,49 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr add
 }
 
 define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_bothen_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   ;
   ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn
   ; GFX11: bb.0 (%ir-block.0):
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
index 96b191d5acead..3cfaa43aaf506 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFx90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 
 define amdgpu_ps void @buffer_atomic_fadd_f64_offset_no_rtn(double %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
   ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_f64_offset_no_rtn
@@ -207,251 +207,655 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32
 }
 
 define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_offset_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_no_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   S_ENDPGM 0
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
   ret void
 }
 
 define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_offen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_no_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   S_ENDPGM 0
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
 }
 
 define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_idxen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_no_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   S_ENDPGM 0
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
   ret void
 }
 
 define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_bothen_no_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; GFX90A-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY9]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_no_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; GFx90A-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   S_ENDPGM 0
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
   ret void
 }
 
 define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
-  ; GFX90A_GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
+  ; GFX90A-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+  ; GFX90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY8]], implicit $exec
+  ; GFX942-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
+  ; GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offset_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
+  ; GFx90A-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+  ; GFx90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFx90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFx90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
   ret double %ret
 }
 
 define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
-  ; GFX90A_GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+  ; GFX90A-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+  ; GFX90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
+  ; GFX942-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
+  ; GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_offen_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+  ; GFx90A-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+  ; GFx90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFx90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFx90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret double %ret
 }
 
 define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_idxen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
-  ; GFX90A_GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+  ; GFX90A-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+  ; GFX90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
+  ; GFX942-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
+  ; GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_idxen_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+  ; GFx90A-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+  ; GFx90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFx90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFx90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
   ret double %ret
 }
 
 define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_bothen_rtn(double %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
-  ; GFX90A_GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
-  ; GFX90A_GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
-  ; GFX90A_GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; GFX90A-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+  ; GFX90A-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
+  ; GFX90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
+  ; GFX90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
+  ; GFX942-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
+  ; GFX942-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec
+  ; GFX942-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFX942-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  ;
+  ; GFx90A-LABEL: name: buffer_ptr_atomic_fadd_f64_bothen_rtn
+  ; GFx90A: bb.0 (%ir-block.0):
+  ; GFx90A-NEXT:   liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr2, $vgpr3, $sgpr4
+  ; GFx90A-NEXT: {{  $}}
+  ; GFx90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFx90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFx90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFx90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFx90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFx90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFx90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFx90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFx90A-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFx90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFx90A-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFx90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
+  ; GFx90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+  ; GFx90A-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFx90A-NEXT:   [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+  ; GFx90A-NEXT:   [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
+  ; GFx90A-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+  ; GFx90A-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
+  ; GFx90A-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
+  ; GFx90A-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+  ; GFx90A-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; GFx90A-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret double %ret
 }
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll
index c30b5549776ea..b051674a915ca 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 
 define amdgpu_ps void @buffer_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
   ; GFX908-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn
@@ -165,25 +165,41 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %va
   ; GFX908-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0)
   ret void
 }
@@ -210,26 +226,43 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val
   ; GFX908-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
 }
@@ -256,26 +289,43 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val
   ; GFX908-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
   ret void
 }
@@ -304,28 +354,47 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %va
   ; GFX908-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   S_ENDPGM 0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll
index 2abd7edade8a1..9a7ba94c3c083 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
 
 define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offset_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
   ; GFX90A_GFX942-LABEL: name: buffer_atomic_fadd_v2f16_offset_rtn
@@ -86,106 +86,179 @@ define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_bothen_rtn(<2 x half> %val
 }
 
 define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_offset_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
   ret <2 x half> %ret
 }
 
 define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_offen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret <2 x half> %ret
 }
 
 define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_idxen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
   ret <2 x half> %ret
 }
 
 define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_bothen_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-  ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_rtn
-  ; GFX90A_GFX942: bb.0 (%ir-block.0):
-  ; GFX90A_GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
-  ; GFX90A_GFX942-NEXT: {{  $}}
-  ; GFX90A_GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; GFX90A_GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; GFX90A_GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-  ; GFX90A_GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-  ; GFX90A_GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; GFX90A_GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; GFX90A_GFX942-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
-  ; GFX90A_GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX90A_GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
-  ; GFX90A_GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]]
-  ; GFX90A_GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_rtn
+  ; GFX90A: bb.0 (%ir-block.0):
+  ; GFX90A-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX90A-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; GFX90A-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; GFX90A-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; GFX90A-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX90A-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX90A-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]]
+  ; GFX90A-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_rtn
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; GFX942-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX942-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX942-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; GFX942-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+  ; GFX942-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+  ; GFX942-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GFX942-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX942-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX942-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+  ; GFX942-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX942-NEXT:   [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+  ; GFX942-NEXT:   $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]]
+  ; GFX942-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
   ret <2 x half> %ret
 }
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index 20795431b4cd8..c0e352f45c724 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -1861,19 +1861,12 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi_immneg128(ptr in
 }
 
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase, i32 %voffset) {
-; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zero_hi:
-; GFX1250-SDAG:       ; %bb.0:
-; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3]
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT:    v_perm_b32 v0, v0, 0, 0x5040100
-; GFX1250-SDAG-NEXT:    ; return to shader part epilog
-;
-; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zero_hi:
-; GFX1250-GISEL:       ; %bb.0:
-; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3]
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-GISEL-NEXT:    ; return to shader part epilog
+; GFX1250-LABEL: flat_load_saddr_i16_d16hi_zero_hi:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    flat_load_u16 v0, v0, s[2:3]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
   %load = load i16, ptr %gep0
@@ -1883,19 +1876,12 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase,
 }
 
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inreg %sbase, i32 %voffset) {
-; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128:
-; GFX1250-SDAG:       ; %bb.0:
-; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT:    v_perm_b32 v0, v0, 0, 0x5040100
-; GFX1250-SDAG-NEXT:    ; return to shader part epilog
-;
-; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128:
-; GFX1250-GISEL:       ; %bb.0:
-; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-GISEL-NEXT:    ; return to shader part epilog
+; GFX1250-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index b25d9b245f5f6..8b62e6a39d824 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -110,10 +110,8 @@ define amdgpu_kernel void @zero_init_kernel() {
 ;
 ; GFX942-LABEL: zero_init_kernel:
 ; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
@@ -304,10 +302,8 @@ define void @zero_init_foo() {
 ; GFX942-LABEL: zero_init_foo:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
@@ -1180,10 +1176,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    scratch_load_dword v0, off, off sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:256
@@ -1397,10 +1391,8 @@ define void @zero_init_small_offset_foo() {
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
@@ -2420,10 +2412,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    s_movk_i32 s0, 0x4004
@@ -2656,10 +2646,8 @@ define void @zero_init_large_offset_foo() {
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    scratch_load_dword v0, off, s32 offset:4 sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    s_mov_b32 s1, s0
-; GFX942-NEXT:    s_mov_b32 s2, s0
-; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-NEXT:    s_add_i32 s0, s32, 0x4004
@@ -4139,8 +4127,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; GFX942-LABEL: store_load_i64_aligned:
 ; GFX942:       ; %bb.0: ; %bb
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v2, 15
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], 15
 ; GFX942-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
@@ -4250,8 +4237,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX942-LABEL: store_load_i64_unaligned:
 ; GFX942:       ; %bb.0: ; %bb
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v2, 15
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], 15
 ; GFX942-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
@@ -5010,10 +4996,8 @@ define amdgpu_ps void @large_offset() {
 ;
 ; GFX942-LABEL: large_offset:
 ; GFX942:       ; %bb.0: ; %bb
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 069a47ec97bfe..20e7a15c829a9 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -4010,8 +4010,8 @@ define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
 ; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.maximum.f64(double %a, double %b)
   %max1 = call double @llvm.maximum.f64(double %max0, double %c)
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index d8746b58b16b7..8aebd6f719040 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -4010,8 +4010,8 @@ define <2 x double> @v_no_fminimum3_f64__multi_use(double %a, double %b, double
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
 ; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.minimum.f64(double %a, double %b)
   %max1 = call double @llvm.minimum.f64(double %max0, double %c)
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index 3a898a9214461..ad436d1cb4e7c 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -304,13 +304,13 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
 ; GCN-SDAG-NEXT:    scratch_store_b32 off, v42, s32 offset:4 scope:SCOPE_SE
 ; GCN-SDAG-NEXT:    scratch_store_b32 off, v43, s32 scope:SCOPE_SE
 ; GCN-SDAG-NEXT:    s_clause 0x7
-; GCN-SDAG-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:112
-; GCN-SDAG-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:96
-; GCN-SDAG-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:80
-; GCN-SDAG-NEXT:    global_load_b128 v[34:37], v[0:1], off offset:48
+; GCN-SDAG-NEXT:    global_load_b128 v[34:37], v[0:1], off offset:112
+; GCN-SDAG-NEXT:    global_load_b128 v[30:33], v[0:1], off offset:96
+; GCN-SDAG-NEXT:    global_load_b128 v[26:29], v[0:1], off offset:80
+; GCN-SDAG-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:48
 ; GCN-SDAG-NEXT:    global_load_b128 v[14:17], v[0:1], off offset:32
-; GCN-SDAG-NEXT:    global_load_b128 v[26:29], v[0:1], off offset:16
-; GCN-SDAG-NEXT:    global_load_b128 v[30:33], v[0:1], off
+; GCN-SDAG-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:16
+; GCN-SDAG-NEXT:    global_load_b128 v[6:9], v[0:1], off
 ; GCN-SDAG-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:64
 ; GCN-SDAG-NEXT:    v_mov_b64_e32 v[24:25], 0x70
 ; GCN-SDAG-NEXT:    v_mov_b64_e32 v[50:51], 0x60
@@ -322,61 +322,61 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
 ; GCN-SDAG-NEXT:    v_mov_b64_e32 v[48:49], 64
 ; GCN-SDAG-NEXT:    v_dual_mov_b32 v22, 0xc8 :: v_dual_mov_b32 v23, 0
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x7
-; GCN-SDAG-NEXT:    global_store_b128 v[24:25], v[10:13], off
+; GCN-SDAG-NEXT:    global_store_b128 v[24:25], v[34:37], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x6
-; GCN-SDAG-NEXT:    global_store_b128 v[50:51], v[18:21], off
+; GCN-SDAG-NEXT:    global_store_b128 v[50:51], v[30:33], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x5
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0x1
-; GCN-SDAG-NEXT:    v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v9
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v24, v28 :: v_dual_mov_b32 v25, v29
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0x0
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21]
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[36:37], v[36:37], v[36:37]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[34:35], v[34:35], v[34:35]
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x4
-; GCN-SDAG-NEXT:    global_store_b128 v[52:53], v[34:37], off
+; GCN-SDAG-NEXT:    global_store_b128 v[52:53], v[18:21], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x3
 ; GCN-SDAG-NEXT:    global_store_b128 v[54:55], v[14:17], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x2
-; GCN-SDAG-NEXT:    global_store_b128 v[40:41], v[26:29], off
+; GCN-SDAG-NEXT:    global_store_b128 v[40:41], v[10:13], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x1
-; GCN-SDAG-NEXT:    global_store_b128 v[42:43], v[30:33], off
+; GCN-SDAG-NEXT:    global_store_b128 v[42:43], v[6:9], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0x3
 ; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[52:53], v[2:3], v[2:3]
 ; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[50:51], v[0:1], v[0:1]
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9]
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[26:27], 0xc8, v[26:27]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[28:29], v[28:29], v[28:29]
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0x2
 ; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[16:17], 0x64, v[16:17]
 ; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0x1
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[28:29], v[28:29], v[28:29]
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[26:27], v[26:27], v[26:27]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0x0
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31]
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[36:37], v[36:37], v[36:37]
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[34:35], v[34:35], v[34:35]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[6:7], v[6:7], v[6:7]
 ; GCN-SDAG-NEXT:    s_clause 0x1
 ; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[22:25], off
 ; GCN-SDAG-NEXT:    global_store_b128 v[48:49], v[0:3], off
 ; GCN-SDAG-NEXT:    s_clause 0x7
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[18:21], off offset:96
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:112
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[30:33], off offset:96
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[34:37], off offset:112
 ; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[50:53], off offset:64
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:80
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[26:29], off offset:80
 ; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[14:17], off offset:32
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[34:37], off offset:48
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[30:33], off
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[26:29], off offset:16
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[18:21], off offset:48
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GCN-SDAG-NEXT:    s_clause 0x3
 ; GCN-SDAG-NEXT:    scratch_load_b32 v43, off, s32
 ; GCN-SDAG-NEXT:    scratch_load_b32 v42, off, s32 offset:4
 ; GCN-SDAG-NEXT:    scratch_load_b32 v41, off, s32 offset:8
 ; GCN-SDAG-NEXT:    scratch_load_b32 v40, off, s32 offset:12
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0xc
-; GCN-SDAG-NEXT:    v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GCN-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
index 0dfeb3454dad5..cfbad1fa6742e 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
@@ -726,8 +726,8 @@ define amdgpu_ps <3 x i32> @s_uitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) {
 ; GFX9-LABEL: s_uitofp_v3i1_to_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s4, 1, s2
-; GFX9-NEXT:    s_and_b32 s2, 1, s1
-; GFX9-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX9-NEXT:    s_and_b32 s2, 1, s0
+; GFX9-NEXT:    s_bitcmp1_b32 s1, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -760,8 +760,8 @@ define amdgpu_ps <3 x i32> @s_uitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) {
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_uitofp_v3i1_to_v3bf16:
@@ -1178,8 +1178,8 @@ define amdgpu_ps <4 x i32> @s_uitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s6, 1, s3
 ; GFX9-NEXT:    s_and_b32 s4, 1, s2
-; GFX9-NEXT:    s_and_b32 s2, 1, s1
-; GFX9-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX9-NEXT:    s_and_b32 s2, 1, s0
+; GFX9-NEXT:    s_bitcmp1_b32 s1, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -1223,8 +1223,8 @@ define amdgpu_ps <4 x i32> @s_uitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v3
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_uitofp_v4i1_to_v4bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/literal64.ll b/llvm/test/CodeGen/AMDGPU/literal64.ll
index 768c9728554df..3d505837f5659 100644
--- a/llvm/test/CodeGen/AMDGPU/literal64.ll
+++ b/llvm/test/CodeGen/AMDGPU/literal64.ll
@@ -284,10 +284,10 @@ define amdgpu_ps <2 x float> @v_cndmask(double %a) {
 ; GCN-SDAG-LABEL: v_cndmask:
 ; GCN-SDAG:       ; %bb.0:
 ; GCN-SDAG-NEXT:    v_cmp_eq_f64_e32 vcc_lo, 0, v[0:1]
-; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40632000
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x40632000
+; GCN-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x40690333, v2, vcc_lo
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x33333333, 0, vcc_lo
-; GCN-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x40690333, v1, vcc_lo
 ; GCN-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GCN-GISEL-LABEL: v_cndmask:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index ff77d5ccbe312..0cbfd2942f13f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -75,10 +75,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-LABEL: test_mfma_f32_32x32x4bf16_1k:
 ; GFX942:       ; %bb.0: ; %bb
 ; GFX942-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX942-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, 1
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
-; GFX942-NEXT:    v_mov_b32_e32 v0, 2
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], 2
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_load_dwordx16 s[16:31], s[34:35], 0x0
 ; GFX942-NEXT:    s_load_dwordx16 s[0:15], s[34:35], 0x40
@@ -116,18 +115,18 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-NEXT:    v_accvgpr_write_b32 a30, s14
 ; GFX942-NEXT:    v_accvgpr_write_b32 a31, s15
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT:    v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
 ; GFX942-NEXT:    s_nop 7
 ; GFX942-NEXT:    s_nop 7
 ; GFX942-NEXT:    s_nop 2
-; GFX942-NEXT:    global_store_dwordx4 v1, a[24:27], s[34:35] offset:96
-; GFX942-NEXT:    global_store_dwordx4 v1, a[28:31], s[34:35] offset:112
-; GFX942-NEXT:    global_store_dwordx4 v1, a[16:19], s[34:35] offset:64
-; GFX942-NEXT:    global_store_dwordx4 v1, a[20:23], s[34:35] offset:80
-; GFX942-NEXT:    global_store_dwordx4 v1, a[8:11], s[34:35] offset:32
-; GFX942-NEXT:    global_store_dwordx4 v1, a[12:15], s[34:35] offset:48
-; GFX942-NEXT:    global_store_dwordx4 v1, a[0:3], s[34:35]
-; GFX942-NEXT:    global_store_dwordx4 v1, a[4:7], s[34:35] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v4, a[24:27], s[34:35] offset:96
+; GFX942-NEXT:    global_store_dwordx4 v4, a[28:31], s[34:35] offset:112
+; GFX942-NEXT:    global_store_dwordx4 v4, a[16:19], s[34:35] offset:64
+; GFX942-NEXT:    global_store_dwordx4 v4, a[20:23], s[34:35] offset:80
+; GFX942-NEXT:    global_store_dwordx4 v4, a[8:11], s[34:35] offset:32
+; GFX942-NEXT:    global_store_dwordx4 v4, a[12:15], s[34:35] offset:48
+; GFX942-NEXT:    global_store_dwordx4 v4, a[0:3], s[34:35]
+; GFX942-NEXT:    global_store_dwordx4 v4, a[4:7], s[34:35] offset:16
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x4bf16_1k:
@@ -191,10 +190,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x4bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v33, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v34, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v35, v33
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v32, 2
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], 1
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[34:35], 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v36, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx16 s[16:31], s[34:35], 0x0
 ; GFX942-VGPR-NEXT:    s_load_dwordx16 s[0:15], s[34:35], 0x40
@@ -232,18 +230,18 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, s14
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, s15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[32:33], v[34:35], v[0:31] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v33, v[16:19], s[34:35] offset:64
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v33, v[20:23], s[34:35] offset:80
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v33, v[8:11], s[34:35] offset:32
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v33, v[12:15], s[34:35] offset:48
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v33, v[0:3], s[34:35]
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v33, v[4:7], s[34:35] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v36, v[24:27], s[34:35] offset:96
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v36, v[28:31], s[34:35] offset:112
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v36, v[16:19], s[34:35] offset:64
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v36, v[20:23], s[34:35] offset:80
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v36, v[8:11], s[34:35] offset:32
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v36, v[12:15], s[34:35] offset:48
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v36, v[0:3], s[34:35]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v36, v[4:7], s[34:35] offset:16
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -294,10 +292,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-LABEL: test_mfma_f32_16x16x4bf16_1k:
 ; GFX942:       ; %bb.0: ; %bb
 ; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, 1
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
-; GFX942-NEXT:    v_mov_b32_e32 v0, 2
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], 2
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
@@ -318,13 +314,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-NEXT:    v_accvgpr_write_b32 a14, s14
 ; GFX942-NEXT:    v_accvgpr_write_b32 a15, s15
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT:    v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 2
-; GFX942-NEXT:    global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
-; GFX942-NEXT:    global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
-; GFX942-NEXT:    global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
-; GFX942-NEXT:    global_store_dwordx4 v1, a[0:3], s[16:17]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x4bf16_1k:
@@ -358,10 +355,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v17, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v18, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v19, v17
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, 2
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], 1
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -374,13 +369,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT:    s_nop 1
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -415,10 +411,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
 ; GFX942-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX942:       ; %bb.0: ; %bb
 ; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, 1
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
-; GFX942-NEXT:    v_mov_b32_e32 v0, 2
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], 2
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
@@ -427,9 +422,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a2, s2
 ; GFX942-NEXT:    v_accvgpr_write_b32 a3, s3
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_4x4x4_16b_bf16 a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT:    v_mfma_f32_4x4x4_16b_bf16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-NEXT:    s_nop 4
-; GFX942-NEXT:    global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX942-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
@@ -453,19 +448,18 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], 1
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -517,10 +511,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-LABEL: test_mfma_f32_32x32x8bf16_1k:
 ; GFX942:       ; %bb.0: ; %bb
 ; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, 1
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
-; GFX942-NEXT:    v_mov_b32_e32 v0, 2
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], 2
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
@@ -541,13 +533,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-NEXT:    v_accvgpr_write_b32 a14, s14
 ; GFX942-NEXT:    v_accvgpr_write_b32 a15, s15
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x8_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT:    v_mfma_f32_32x32x8_bf16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 2
-; GFX942-NEXT:    global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
-; GFX942-NEXT:    global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
-; GFX942-NEXT:    global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
-; GFX942-NEXT:    global_store_dwordx4 v1, a[0:3], s[16:17]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x8bf16_1k:
@@ -582,10 +575,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v17, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v18, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v19, v17
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, 2
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], 1
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -598,13 +589,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x8_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x8_bf16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT:    s_nop 1
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -640,10 +632,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX942-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX942:       ; %bb.0: ; %bb
 ; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, 1
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
-; GFX942-NEXT:    v_mov_b32_e32 v0, 2
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], 2
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
@@ -652,9 +643,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX942-NEXT:    v_accvgpr_write_b32 a2, s2
 ; GFX942-NEXT:    v_accvgpr_write_b32 a3, s3
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_16x16x16_bf16 a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT:    v_mfma_f32_16x16x16_bf16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-NEXT:    s_nop 6
-; GFX942-NEXT:    global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX942-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
@@ -679,19 +670,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], 1
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 6
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -1310,16 +1300,19 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX942:       ; %bb.0: ; %bb
 ; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, 64
-; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT:    s_mov_b32 s4, 0
+; GFX942-NEXT:    s_mov_b32 s5, 64
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a4, a0
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a6, a0
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a7, a1
+; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT:    v_accvgpr_mov_b32 a4, a0
+; GFX942-NEXT:    v_accvgpr_mov_b32 a6, a0
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
@@ -1364,30 +1357,25 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 64
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX942-VGPR-NEXT:    s_mov_b32 s6, 0
+; GFX942-VGPR-NEXT:    s_mov_b32 s7, 64
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[8:9]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    s_nop 0
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877906944 to double)), i32 0, i32 0, i32 0)
@@ -1429,14 +1417,14 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
 ; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-NEXT:    v_accvgpr_write_b32 a0, 64
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a1
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a0
+; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a1
+; GFX942-NEXT:    v_accvgpr_mov_b32 a7, a1
+; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a0
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a7, a0
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
@@ -1478,18 +1466,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 64
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX942-VGPR-NEXT:    s_mov_b32 s6, 64
+; GFX942-VGPR-NEXT:    s_mov_b32 s7, s6
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[8:9]
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
@@ -1540,14 +1526,14 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
 ; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-NEXT:    v_accvgpr_write_b32 a0, 1.0
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a1
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a0
+; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a1
+; GFX942-NEXT:    v_accvgpr_mov_b32 a7, a1
+; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a0
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a7, a0
 ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
@@ -1589,18 +1575,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX942-VGPR-NEXT:    s_mov_b32 s6, 1.0
+; GFX942-VGPR-NEXT:    s_mov_b32 s7, s6
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[8:9]
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
@@ -1650,18 +1634,19 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX942:       ; %bb.0: ; %bb
 ; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x3ff00000
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX942-NEXT:    v_mov_b32_e32 v3, s3
-; GFX942-NEXT:    v_accvgpr_mov_b32 a1, a0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 1.0
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v1
+; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a1
+; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a1
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v0
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a0
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a6, a0
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
@@ -1706,29 +1691,22 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v13, s3
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], 1.0
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v10, s2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v11, s3
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    s_nop 0
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
@@ -1767,20 +1745,22 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
 ; GFX942:       ; %bb.0: ; %bb
 ; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x405ec000
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
+; GFX942-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX942-NEXT:    s_mov_b32 s6, 0
+; GFX942-NEXT:    s_mov_b32 s7, 0x405ec000
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX942-NEXT:    v_mov_b32_e32 v3, s3
-; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a7, a1
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
@@ -1823,30 +1803,25 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0x405ec000
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX942-VGPR-NEXT:    s_mov_b32 s6, 0
+; GFX942-VGPR-NEXT:    s_mov_b32 s7, 0x405ec000
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v13, s3
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v10, s2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v11, s3
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    s_nop 0
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 78be949baabac..464216e9f5a2b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -5573,123 +5573,94 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ;
 ; GFX942-LABEL: test_mfma_f32_32x32x1f32_imm:
 ; GFX942:       ; %bb.0: ; %bb
-; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, 1.0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a4, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a6, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a7, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a8, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a9, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a10, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a11, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a12, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a13, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a14, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a15, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a16, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a17, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a18, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a19, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a20, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a21, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a22, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a23, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a24, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a25, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a26, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a27, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a28, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a29, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a30, a1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a31, a1
-; GFX942-NEXT:    v_mov_b32_e32 v2, 2.0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0x3f800000
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT:    v_accvgpr_mov_b32 a27, a3
+; GFX942-NEXT:    v_accvgpr_mov_b32 a29, a3
+; GFX942-NEXT:    v_accvgpr_mov_b32 a31, a3
+; GFX942-NEXT:    v_accvgpr_mov_b32 a26, a2
+; GFX942-NEXT:    v_accvgpr_mov_b32 a28, a2
+; GFX942-NEXT:    v_accvgpr_mov_b32 a30, a2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_nop 7
 ; GFX942-NEXT:    s_nop 7
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT:    global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT:    global_store_dwordx4 v2, a[28:31], s[0:1] offset:112
+; GFX942-NEXT:    global_store_dwordx4 v2, a[24:27], s[0:1] offset:96
+; GFX942-NEXT:    global_store_dwordx4 v2, a[20:23], s[0:1] offset:80
+; GFX942-NEXT:    global_store_dwordx4 v2, a[16:19], s[0:1] offset:64
+; GFX942-NEXT:    global_store_dwordx4 v2, a[12:15], s[0:1] offset:48
+; GFX942-NEXT:    global_store_dwordx4 v2, a[8:11], s[0:1] offset:32
+; GFX942-NEXT:    global_store_dwordx4 v2, a[4:7], s[0:1] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v2, a[0:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v10, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v11, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v12, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v13, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v14, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v15, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v17, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v18, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v19, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v20, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v21, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v22, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v23, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v24, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v25, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v26, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v27, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v28, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v29, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], v[30:31]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v34, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[30:31], v[28:29]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[28:29], v[26:27]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[26:27], v[24:25]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[24:25], v[22:23]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[22:23], v[20:21]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[20:21], v[18:19]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], v[16:17]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], v[14:15]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[12:13]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[10:11]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v33, 1.0
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], 0
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], 0x3f800000
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[20:21], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[22:23], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[24:25], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[26:27], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[28:29], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[30:31], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v34, 2.0
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[22:25], s[0:1] offset:80
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[18:21], s[0:1] offset:64
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[14:17], s[0:1] offset:48
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[10:13], s[0:1] offset:32
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
index f971080e02c5b..090707eda3ca5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -561,17 +561,17 @@ define <2 x double> @v_maximum_v2f64(<2 x double> %src0, <2 x double> %src1) {
 ; GFX950-LABEL: v_maximum_v2f64:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT:    v_max_f64 v[4:5], v[2:3], v[6:7]
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX950-NEXT:    v_max_f64 v[8:9], v[2:3], v[6:7]
+; GFX950-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
+; GFX950-NEXT:    v_max_f64 v[6:7], v[0:1], v[4:5]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f64:
@@ -630,12 +630,19 @@ define <2 x double> @v_maximum_v2f64__nnan(<2 x double> %src0, <2 x double> %src
 ; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v2f64__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v2f64__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v2f64__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f64__nnan:
 ; GFX10:       ; %bb.0:
@@ -711,17 +718,17 @@ define <2 x double> @v_maximum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1
 ; GFX950-LABEL: v_maximum_v2f64__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT:    v_max_f64 v[4:5], v[2:3], v[6:7]
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX950-NEXT:    v_max_f64 v[8:9], v[2:3], v[6:7]
+; GFX950-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
+; GFX950-NEXT:    v_max_f64 v[6:7], v[0:1], v[4:5]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f64__nsz:
@@ -780,12 +787,19 @@ define <2 x double> @v_maximum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double>
 ; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v2f64__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v2f64__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v2f64__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1008,22 +1022,22 @@ define <3 x double> @v_maximum_v3f64(<3 x double> %src0, <3 x double> %src1) {
 ; GFX950-LABEL: v_maximum_v3f64:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[12:13], v[0:1], v[6:7]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX950-NEXT:    v_max_f64 v[6:7], v[2:3], v[8:9]
+; GFX950-NEXT:    v_max_f64 v[12:13], v[4:5], v[10:11]
+; GFX950-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[10:11], v[2:3], v[8:9]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v13, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v12, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT:    v_max_f64 v[8:9], v[0:1], v[6:7]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v11, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v10, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
-; GFX950-NEXT:    v_max_f64 v[6:7], v[4:5], v[10:11]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f64:
@@ -1092,13 +1106,21 @@ define <3 x double> @v_maximum_v3f64__nnan(<3 x double> %src0, <3 x double> %src
 ; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v3f64__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v3f64__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v3f64__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f64__nnan:
 ; GFX10:       ; %bb.0:
@@ -1189,22 +1211,22 @@ define <3 x double> @v_maximum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1
 ; GFX950-LABEL: v_maximum_v3f64__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[12:13], v[0:1], v[6:7]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX950-NEXT:    v_max_f64 v[6:7], v[2:3], v[8:9]
+; GFX950-NEXT:    v_max_f64 v[12:13], v[4:5], v[10:11]
+; GFX950-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[10:11], v[2:3], v[8:9]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v13, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v12, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT:    v_max_f64 v[8:9], v[0:1], v[6:7]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v11, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v10, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
-; GFX950-NEXT:    v_max_f64 v[6:7], v[4:5], v[10:11]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f64__nsz:
@@ -1273,13 +1295,21 @@ define <3 x double> @v_maximum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double>
 ; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v3f64__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v3f64__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v3f64__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1382,27 +1412,27 @@ define <4 x double> @v_maximum_v4f64(<4 x double> %src0, <4 x double> %src1) {
 ; GFX950-LABEL: v_maximum_v4f64:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[16:17], v[0:1], v[8:9]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX950-NEXT:    v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[16:17], v[6:7], v[14:15]
+; GFX950-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
+; GFX950-NEXT:    v_max_f64 v[14:15], v[4:5], v[12:13]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
-; GFX950-NEXT:    v_max_f64 v[8:9], v[4:5], v[12:13]
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v17, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v16, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
+; GFX950-NEXT:    v_max_f64 v[12:13], v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v15, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v14, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[10:11], v[0:1], v[8:9]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v13, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v12, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
-; GFX950-NEXT:    v_max_f64 v[8:9], v[6:7], v[14:15]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v11, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v10, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f64:
@@ -1482,14 +1512,23 @@ define <4 x double> @v_maximum_v4f64__nnan(<4 x double> %src0, <4 x double> %src
 ; GFX8-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v4f64__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
-; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
-; GFX9-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v4f64__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
+; GFX900-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX900-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v4f64__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f64__nnan:
 ; GFX10:       ; %bb.0:
@@ -1595,27 +1634,27 @@ define <4 x double> @v_maximum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1
 ; GFX950-LABEL: v_maximum_v4f64__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[16:17], v[0:1], v[8:9]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX950-NEXT:    v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[16:17], v[6:7], v[14:15]
+; GFX950-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
+; GFX950-NEXT:    v_max_f64 v[14:15], v[4:5], v[12:13]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
-; GFX950-NEXT:    v_max_f64 v[8:9], v[4:5], v[12:13]
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v17, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v16, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
+; GFX950-NEXT:    v_max_f64 v[12:13], v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v15, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v14, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[10:11], v[0:1], v[8:9]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v13, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v12, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
-; GFX950-NEXT:    v_max_f64 v[8:9], v[6:7], v[14:15]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v11, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v10, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f64__nsz:
@@ -1695,14 +1734,23 @@ define <4 x double> @v_maximum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double>
 ; GFX8-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v4f64__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
-; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
-; GFX9-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v4f64__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
+; GFX900-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX900-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v4f64__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1864,43 +1912,43 @@ define <8 x double> @v_maximum_v8f64(<8 x double> %src0, <8 x double> %src1) {
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    scratch_load_dword v31, off, s32
 ; GFX950-NEXT:    v_mov_b32_e32 v54, 0x7ff80000
-; GFX950-NEXT:    v_max_f64 v[32:33], v[0:1], v[16:17]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[16:17]
-; GFX950-NEXT:    v_max_f64 v[34:35], v[2:3], v[18:19]
-; GFX950-NEXT:    v_max_f64 v[36:37], v[4:5], v[20:21]
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v32, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v33, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
+; GFX950-NEXT:    v_max_f64 v[32:33], v[12:13], v[28:29]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[28:29]
+; GFX950-NEXT:    v_max_f64 v[34:35], v[10:11], v[26:27]
+; GFX950-NEXT:    v_max_f64 v[36:37], v[8:9], v[24:25]
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v33, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v12, v32, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[26:27]
 ; GFX950-NEXT:    v_max_f64 v[38:39], v[6:7], v[22:23]
-; GFX950-NEXT:    v_max_f64 v[48:49], v[8:9], v[24:25]
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v34, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v35, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[20:21]
-; GFX950-NEXT:    v_max_f64 v[50:51], v[10:11], v[26:27]
-; GFX950-NEXT:    v_max_f64 v[52:53], v[12:13], v[28:29]
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v36, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v37, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[22:23]
-; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[16:17], v[14:15], v[30:31]
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v38, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v39, v54, vcc
+; GFX950-NEXT:    v_max_f64 v[48:49], v[4:5], v[20:21]
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v35, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v10, v34, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[24:25]
+; GFX950-NEXT:    v_max_f64 v[50:51], v[2:3], v[18:19]
+; GFX950-NEXT:    v_max_f64 v[52:53], v[0:1], v[16:17]
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v37, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v8, v36, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[22:23]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v8, v48, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v49, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[26:27]
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v39, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v38, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[20:21]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v10, v50, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v51, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[28:29]
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v49, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v48, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v12, v52, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v53, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v51, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v50, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[16:17]
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[16:17], v[14:15], v[30:31]
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v53, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v52, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[30:31]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v14, v16, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e32 v15, v17, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v14, v16, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v8f64:
@@ -2371,152 +2419,144 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX950-NEXT:    v_accvgpr_write_b32 a3, v43 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a4, v44 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a5, v45 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a6, v46 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a7, v47 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a8, v56 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a9, v57 ; Reload Reuse
-; GFX950-NEXT:    scratch_load_dword v33, off, s32 offset:8
-; GFX950-NEXT:    scratch_load_dword v32, off, s32 offset:4
-; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:16
-; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:12
-; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:24
-; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:20
+; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:120
+; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:116
 ; GFX950-NEXT:    scratch_load_dword v57, off, s32 offset:32
 ; GFX950-NEXT:    scratch_load_dword v56, off, s32 offset:28
-; GFX950-NEXT:    scratch_load_dword v47, off, s32 offset:40
-; GFX950-NEXT:    scratch_load_dword v46, off, s32 offset:36
-; GFX950-NEXT:    scratch_load_dword v45, off, s32 offset:48
-; GFX950-NEXT:    scratch_load_dword v44, off, s32 offset:44
-; GFX950-NEXT:    scratch_load_dword v43, off, s32 offset:56
-; GFX950-NEXT:    scratch_load_dword v42, off, s32 offset:52
-; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:64
-; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:60
-; GFX950-NEXT:    scratch_load_dword v55, off, s32 offset:72
-; GFX950-NEXT:    scratch_load_dword v54, off, s32 offset:68
-; GFX950-NEXT:    scratch_load_dword v53, off, s32 offset:80
-; GFX950-NEXT:    scratch_load_dword v52, off, s32 offset:76
-; GFX950-NEXT:    scratch_load_dword v51, off, s32 offset:88
-; GFX950-NEXT:    scratch_load_dword v50, off, s32 offset:84
-; GFX950-NEXT:    scratch_load_dword v49, off, s32 offset:96
-; GFX950-NEXT:    scratch_load_dword v48, off, s32 offset:92
+; GFX950-NEXT:    scratch_load_dword v45, off, s32 offset:24
+; GFX950-NEXT:    scratch_load_dword v44, off, s32 offset:20
+; GFX950-NEXT:    scratch_load_dword v43, off, s32 offset:16
+; GFX950-NEXT:    scratch_load_dword v42, off, s32 offset:12
+; GFX950-NEXT:    scratch_load_dword v53, off, s32 offset:8
+; GFX950-NEXT:    scratch_load_dword v52, off, s32 offset:4
+; GFX950-NEXT:    scratch_load_dword v49, off, s32 offset:112
 ; GFX950-NEXT:    scratch_load_dword v31, off, s32
-; GFX950-NEXT:    scratch_load_dword v35, off, s32 offset:104
-; GFX950-NEXT:    scratch_load_dword v34, off, s32 offset:100
+; GFX950-NEXT:    scratch_load_dword v55, off, s32 offset:128
+; GFX950-NEXT:    scratch_load_dword v54, off, s32 offset:124
+; GFX950-NEXT:    scratch_load_dword v48, off, s32 offset:108
+; GFX950-NEXT:    scratch_load_dword v51, off, s32 offset:104
+; GFX950-NEXT:    scratch_load_dword v50, off, s32 offset:100
+; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:96
+; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:92
+; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:88
+; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:84
+; GFX950-NEXT:    scratch_load_dword v35, off, s32 offset:80
+; GFX950-NEXT:    scratch_load_dword v34, off, s32 offset:76
+; GFX950-NEXT:    scratch_load_dword v33, off, s32 offset:72
+; GFX950-NEXT:    scratch_load_dword v32, off, s32 offset:68
+; GFX950-NEXT:    v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a7, v47 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a10, v58 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a11, v59 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a12, v60 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a13, v61 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a14, v62 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a15, v63 ; Reload Reuse
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_max_f64 v[58:59], v[0:1], v[32:33]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[32:33]
-; GFX950-NEXT:    scratch_load_dword v33, off, s32 offset:112
-; GFX950-NEXT:    scratch_load_dword v32, off, s32 offset:108
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_max_f64 v[60:61], v[2:3], v[36:37]
-; GFX950-NEXT:    v_cmp_u_f64_e64 s[0:1], v[2:3], v[36:37]
-; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:120
-; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:116
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_max_f64 v[62:63], v[4:5], v[38:39]
-; GFX950-NEXT:    v_cmp_u_f64_e64 s[2:3], v[4:5], v[38:39]
-; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:128
-; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:124
-; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7ff80000
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[6:7], v[56:57]
-; GFX950-NEXT:    v_cmp_u_f64_e64 s[4:5], v[6:7], v[56:57]
+; GFX950-NEXT:    v_mov_b32_e32 v60, 0x7ff80000
 ; GFX950-NEXT:    s_waitcnt vmcnt(23)
-; GFX950-NEXT:    v_max_f64 v[56:57], v[8:9], v[46:47]
-; GFX950-NEXT:    v_cndmask_b32_e64 v58, v58, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v59, v59, v2, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[46:47]
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v0, 0, s[4:5]
-; GFX950-NEXT:    v_cndmask_b32_e64 v7, v1, v2, s[4:5]
-; GFX950-NEXT:    v_cndmask_b32_e64 v8, v56, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v57, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(21)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[10:11], v[44:45]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[44:45]
-; GFX950-NEXT:    v_cndmask_b32_e64 v60, v60, 0, s[0:1]
-; GFX950-NEXT:    v_cndmask_b32_e64 v3, v61, v2, s[0:1]
-; GFX950-NEXT:    v_cndmask_b32_e64 v10, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v1, v2, vcc
+; GFX950-NEXT:    v_max_f64 v[46:47], v[28:29], v[40:41]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[28:29], v[40:41]
+; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:64
+; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:60
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_max_f64 v[58:59], v[6:7], v[56:57]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[0:1], v[6:7], v[56:57]
+; GFX950-NEXT:    scratch_load_dword v7, off, s32 offset:56
+; GFX950-NEXT:    scratch_load_dword v6, off, s32 offset:52
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_max_f64 v[56:57], v[4:5], v[44:45]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[2:3], v[4:5], v[44:45]
+; GFX950-NEXT:    scratch_load_dword v5, off, s32 offset:48
+; GFX950-NEXT:    scratch_load_dword v4, off, s32 offset:44
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_max_f64 v[44:45], v[2:3], v[42:43]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[42:43]
+; GFX950-NEXT:    scratch_load_dword v3, off, s32 offset:40
+; GFX950-NEXT:    scratch_load_dword v2, off, s32 offset:36
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_max_f64 v[42:43], v[0:1], v[52:53]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[6:7], v[0:1], v[52:53]
 ; GFX950-NEXT:    s_waitcnt vmcnt(19)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[12:13], v[42:43]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[42:43]
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v62, 0, s[2:3]
-; GFX950-NEXT:    v_cndmask_b32_e64 v5, v63, v2, s[2:3]
-; GFX950-NEXT:    v_cndmask_b32_e64 v12, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(17)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[14:15], v[40:41]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[40:41]
-; GFX950-NEXT:    v_accvgpr_read_b32 v63, a15 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v62, a14 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v14, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(15)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[16:17], v[54:55]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[16:17], v[54:55]
-; GFX950-NEXT:    v_accvgpr_read_b32 v61, a13 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v57, a9 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v16, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(13)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[18:19], v[52:53]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[18:19], v[52:53]
-; GFX950-NEXT:    v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX950-NEXT:    v_max_f64 v[0:1], v[30:31], v[54:55]
+; GFX950-NEXT:    s_waitcnt vmcnt(18)
+; GFX950-NEXT:    v_max_f64 v[52:53], v[26:27], v[48:49]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[8:9], v[30:31], v[54:55]
+; GFX950-NEXT:    v_cndmask_b32_e32 v29, v47, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v28, v46, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[26:27], v[48:49]
+; GFX950-NEXT:    v_cndmask_b32_e64 v31, v1, v60, s[8:9]
+; GFX950-NEXT:    v_cndmask_b32_e64 v30, v0, 0, s[8:9]
+; GFX950-NEXT:    v_cndmask_b32_e32 v27, v53, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v26, v52, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(16)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[24:25], v[50:51]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[24:25], v[50:51]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v47, a7 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v18, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(11)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[20:21], v[50:51]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[20:21], v[50:51]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v46, a6 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v45, a5 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v20, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(9)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[22:23], v[48:49]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[22:23], v[48:49]
-; GFX950-NEXT:    v_accvgpr_read_b32 v44, a4 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e32 v25, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v24, v0, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(14)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[22:23], v[38:39]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[22:23], v[38:39]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v1, v60, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v22, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v1, v2, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(12)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[20:21], v[36:37]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[20:21], v[36:37]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v20, v0, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(10)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[18:19], v[34:35]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[18:19], v[34:35]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v18, v0, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(8)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[16:17], v[32:33]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[16:17], v[32:33]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v16, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(6)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[24:25], v[34:35]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[24:25], v[34:35]
-; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX950-NEXT:    v_max_f64 v[0:1], v[14:15], v[40:41]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[40:41]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v41, a1 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v24, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v25, v1, v2, vcc
 ; GFX950-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v14, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(4)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[26:27], v[32:33]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[26:27], v[32:33]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v26, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v27, v1, v2, vcc
+; GFX950-NEXT:    v_max_f64 v[0:1], v[12:13], v[6:7]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[6:7]
+; GFX950-NEXT:    v_cndmask_b32_e64 v7, v59, v60, s[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v58, 0, s[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v12, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(2)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[28:29], v[36:37]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[28:29], v[36:37]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v28, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v29, v1, v2, vcc
+; GFX950-NEXT:    v_max_f64 v[0:1], v[10:11], v[4:5]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[4:5]
+; GFX950-NEXT:    v_cndmask_b32_e64 v5, v57, v60, s[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v56, 0, s[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v10, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_max_f64 v[0:1], v[30:31], v[38:39]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[30:31], v[38:39]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v30, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v1, v2, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v0, v58
-; GFX950-NEXT:    v_mov_b32_e32 v1, v59
-; GFX950-NEXT:    v_mov_b32_e32 v2, v60
+; GFX950-NEXT:    v_max_f64 v[0:1], v[8:9], v[2:3]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e64 v3, v45, v60, s[4:5]
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v44, 0, s[4:5]
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v1, v43, v60, s[6:7]
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v42, 0, s[6:7]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v60, a12 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_read_b32 v59, a11 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_read_b32 v58, a10 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v57, a9 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v45, a5 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v44, a4 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v16f64:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
index dfd67873c3b86..b119dd425463b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
@@ -561,17 +561,17 @@ define <2 x double> @v_minimum_v2f64(<2 x double> %src0, <2 x double> %src1) {
 ; GFX950-LABEL: v_minimum_v2f64:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT:    v_min_f64 v[4:5], v[2:3], v[6:7]
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX950-NEXT:    v_min_f64 v[8:9], v[2:3], v[6:7]
+; GFX950-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
+; GFX950-NEXT:    v_min_f64 v[6:7], v[0:1], v[4:5]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f64:
@@ -630,12 +630,19 @@ define <2 x double> @v_minimum_v2f64__nnan(<2 x double> %src0, <2 x double> %src
 ; GFX8-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v2f64__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v2f64__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v2f64__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f64__nnan:
 ; GFX10:       ; %bb.0:
@@ -711,17 +718,17 @@ define <2 x double> @v_minimum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1
 ; GFX950-LABEL: v_minimum_v2f64__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT:    v_min_f64 v[4:5], v[2:3], v[6:7]
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX950-NEXT:    v_min_f64 v[8:9], v[2:3], v[6:7]
+; GFX950-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
+; GFX950-NEXT:    v_min_f64 v[6:7], v[0:1], v[4:5]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v10, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f64__nsz:
@@ -780,12 +787,19 @@ define <2 x double> @v_minimum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double>
 ; GFX8-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v2f64__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v2f64__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v2f64__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1008,22 +1022,22 @@ define <3 x double> @v_minimum_v3f64(<3 x double> %src0, <3 x double> %src1) {
 ; GFX950-LABEL: v_minimum_v3f64:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[12:13], v[0:1], v[6:7]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX950-NEXT:    v_min_f64 v[6:7], v[2:3], v[8:9]
+; GFX950-NEXT:    v_min_f64 v[12:13], v[4:5], v[10:11]
+; GFX950-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[10:11], v[2:3], v[8:9]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v13, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v12, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT:    v_min_f64 v[8:9], v[0:1], v[6:7]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v11, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v10, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
-; GFX950-NEXT:    v_min_f64 v[6:7], v[4:5], v[10:11]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f64:
@@ -1092,13 +1106,21 @@ define <3 x double> @v_minimum_v3f64__nnan(<3 x double> %src0, <3 x double> %src
 ; GFX8-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v3f64__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v3f64__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v3f64__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f64__nnan:
 ; GFX10:       ; %bb.0:
@@ -1189,22 +1211,22 @@ define <3 x double> @v_minimum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1
 ; GFX950-LABEL: v_minimum_v3f64__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[12:13], v[0:1], v[6:7]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX950-NEXT:    v_min_f64 v[6:7], v[2:3], v[8:9]
+; GFX950-NEXT:    v_min_f64 v[12:13], v[4:5], v[10:11]
+; GFX950-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[10:11], v[2:3], v[8:9]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v13, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v12, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT:    v_min_f64 v[8:9], v[0:1], v[6:7]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v11, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v10, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
-; GFX950-NEXT:    v_min_f64 v[6:7], v[4:5], v[10:11]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v14, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f64__nsz:
@@ -1273,13 +1295,21 @@ define <3 x double> @v_minimum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double>
 ; GFX8-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v3f64__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v3f64__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v3f64__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1382,27 +1412,27 @@ define <4 x double> @v_minimum_v4f64(<4 x double> %src0, <4 x double> %src1) {
 ; GFX950-LABEL: v_minimum_v4f64:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[16:17], v[0:1], v[8:9]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX950-NEXT:    v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[16:17], v[6:7], v[14:15]
+; GFX950-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
+; GFX950-NEXT:    v_min_f64 v[14:15], v[4:5], v[12:13]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
-; GFX950-NEXT:    v_min_f64 v[8:9], v[4:5], v[12:13]
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v17, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v16, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
+; GFX950-NEXT:    v_min_f64 v[12:13], v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v15, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v14, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[10:11], v[0:1], v[8:9]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v13, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v12, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
-; GFX950-NEXT:    v_min_f64 v[8:9], v[6:7], v[14:15]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v11, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v10, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f64:
@@ -1482,14 +1512,23 @@ define <4 x double> @v_minimum_v4f64__nnan(<4 x double> %src0, <4 x double> %src
 ; GFX8-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v4f64__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
-; GFX9-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
-; GFX9-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v4f64__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
+; GFX900-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX900-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v4f64__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f64__nnan:
 ; GFX10:       ; %bb.0:
@@ -1595,27 +1634,27 @@ define <4 x double> @v_minimum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1
 ; GFX950-LABEL: v_minimum_v4f64__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[16:17], v[0:1], v[8:9]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX950-NEXT:    v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[16:17], v[6:7], v[14:15]
+; GFX950-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
+; GFX950-NEXT:    v_min_f64 v[14:15], v[4:5], v[12:13]
 ; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
-; GFX950-NEXT:    v_min_f64 v[8:9], v[4:5], v[12:13]
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v17, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v16, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
+; GFX950-NEXT:    v_min_f64 v[12:13], v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v15, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v14, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[10:11], v[0:1], v[8:9]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v13, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v12, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
-; GFX950-NEXT:    v_min_f64 v[8:9], v[6:7], v[14:15]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v11, v18, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v10, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f64__nsz:
@@ -1695,14 +1734,23 @@ define <4 x double> @v_minimum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double>
 ; GFX8-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v4f64__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
-; GFX9-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
-; GFX9-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v4f64__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
+; GFX900-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX900-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v4f64__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1864,43 +1912,43 @@ define <8 x double> @v_minimum_v8f64(<8 x double> %src0, <8 x double> %src1) {
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    scratch_load_dword v31, off, s32
 ; GFX950-NEXT:    v_mov_b32_e32 v54, 0x7ff80000
-; GFX950-NEXT:    v_min_f64 v[32:33], v[0:1], v[16:17]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[16:17]
-; GFX950-NEXT:    v_min_f64 v[34:35], v[2:3], v[18:19]
-; GFX950-NEXT:    v_min_f64 v[36:37], v[4:5], v[20:21]
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v32, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v33, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
+; GFX950-NEXT:    v_min_f64 v[32:33], v[12:13], v[28:29]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[28:29]
+; GFX950-NEXT:    v_min_f64 v[34:35], v[10:11], v[26:27]
+; GFX950-NEXT:    v_min_f64 v[36:37], v[8:9], v[24:25]
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v33, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v12, v32, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[26:27]
 ; GFX950-NEXT:    v_min_f64 v[38:39], v[6:7], v[22:23]
-; GFX950-NEXT:    v_min_f64 v[48:49], v[8:9], v[24:25]
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v34, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v35, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[20:21]
-; GFX950-NEXT:    v_min_f64 v[50:51], v[10:11], v[26:27]
-; GFX950-NEXT:    v_min_f64 v[52:53], v[12:13], v[28:29]
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v36, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v37, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[22:23]
-; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[16:17], v[14:15], v[30:31]
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v38, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v39, v54, vcc
+; GFX950-NEXT:    v_min_f64 v[48:49], v[4:5], v[20:21]
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v35, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v10, v34, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[24:25]
+; GFX950-NEXT:    v_min_f64 v[50:51], v[2:3], v[18:19]
+; GFX950-NEXT:    v_min_f64 v[52:53], v[0:1], v[16:17]
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v37, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v8, v36, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[22:23]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v8, v48, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v49, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[26:27]
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v39, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v38, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[20:21]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v10, v50, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v51, v54, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[28:29]
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v49, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v48, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v12, v52, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v53, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v51, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v50, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[16:17]
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[16:17], v[14:15], v[30:31]
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v53, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v52, 0, vcc
 ; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[30:31]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v14, v16, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e32 v15, v17, v54, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v14, v16, 0, vcc
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v8f64:
@@ -2371,152 +2419,144 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX950-NEXT:    v_accvgpr_write_b32 a3, v43 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a4, v44 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a5, v45 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a6, v46 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a7, v47 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a8, v56 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a9, v57 ; Reload Reuse
-; GFX950-NEXT:    scratch_load_dword v33, off, s32 offset:8
-; GFX950-NEXT:    scratch_load_dword v32, off, s32 offset:4
-; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:16
-; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:12
-; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:24
-; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:20
+; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:120
+; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:116
 ; GFX950-NEXT:    scratch_load_dword v57, off, s32 offset:32
 ; GFX950-NEXT:    scratch_load_dword v56, off, s32 offset:28
-; GFX950-NEXT:    scratch_load_dword v47, off, s32 offset:40
-; GFX950-NEXT:    scratch_load_dword v46, off, s32 offset:36
-; GFX950-NEXT:    scratch_load_dword v45, off, s32 offset:48
-; GFX950-NEXT:    scratch_load_dword v44, off, s32 offset:44
-; GFX950-NEXT:    scratch_load_dword v43, off, s32 offset:56
-; GFX950-NEXT:    scratch_load_dword v42, off, s32 offset:52
-; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:64
-; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:60
-; GFX950-NEXT:    scratch_load_dword v55, off, s32 offset:72
-; GFX950-NEXT:    scratch_load_dword v54, off, s32 offset:68
-; GFX950-NEXT:    scratch_load_dword v53, off, s32 offset:80
-; GFX950-NEXT:    scratch_load_dword v52, off, s32 offset:76
-; GFX950-NEXT:    scratch_load_dword v51, off, s32 offset:88
-; GFX950-NEXT:    scratch_load_dword v50, off, s32 offset:84
-; GFX950-NEXT:    scratch_load_dword v49, off, s32 offset:96
-; GFX950-NEXT:    scratch_load_dword v48, off, s32 offset:92
+; GFX950-NEXT:    scratch_load_dword v45, off, s32 offset:24
+; GFX950-NEXT:    scratch_load_dword v44, off, s32 offset:20
+; GFX950-NEXT:    scratch_load_dword v43, off, s32 offset:16
+; GFX950-NEXT:    scratch_load_dword v42, off, s32 offset:12
+; GFX950-NEXT:    scratch_load_dword v53, off, s32 offset:8
+; GFX950-NEXT:    scratch_load_dword v52, off, s32 offset:4
+; GFX950-NEXT:    scratch_load_dword v49, off, s32 offset:112
 ; GFX950-NEXT:    scratch_load_dword v31, off, s32
-; GFX950-NEXT:    scratch_load_dword v35, off, s32 offset:104
-; GFX950-NEXT:    scratch_load_dword v34, off, s32 offset:100
+; GFX950-NEXT:    scratch_load_dword v55, off, s32 offset:128
+; GFX950-NEXT:    scratch_load_dword v54, off, s32 offset:124
+; GFX950-NEXT:    scratch_load_dword v48, off, s32 offset:108
+; GFX950-NEXT:    scratch_load_dword v51, off, s32 offset:104
+; GFX950-NEXT:    scratch_load_dword v50, off, s32 offset:100
+; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:96
+; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:92
+; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:88
+; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:84
+; GFX950-NEXT:    scratch_load_dword v35, off, s32 offset:80
+; GFX950-NEXT:    scratch_load_dword v34, off, s32 offset:76
+; GFX950-NEXT:    scratch_load_dword v33, off, s32 offset:72
+; GFX950-NEXT:    scratch_load_dword v32, off, s32 offset:68
+; GFX950-NEXT:    v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a7, v47 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a10, v58 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a11, v59 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_write_b32 a12, v60 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a13, v61 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a14, v62 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_write_b32 a15, v63 ; Reload Reuse
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_min_f64 v[58:59], v[0:1], v[32:33]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[32:33]
-; GFX950-NEXT:    scratch_load_dword v33, off, s32 offset:112
-; GFX950-NEXT:    scratch_load_dword v32, off, s32 offset:108
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_min_f64 v[60:61], v[2:3], v[36:37]
-; GFX950-NEXT:    v_cmp_u_f64_e64 s[0:1], v[2:3], v[36:37]
-; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:120
-; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:116
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_min_f64 v[62:63], v[4:5], v[38:39]
-; GFX950-NEXT:    v_cmp_u_f64_e64 s[2:3], v[4:5], v[38:39]
-; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:128
-; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:124
-; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7ff80000
-; GFX950-NEXT:    s_waitcnt vmcnt(25)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[6:7], v[56:57]
-; GFX950-NEXT:    v_cmp_u_f64_e64 s[4:5], v[6:7], v[56:57]
+; GFX950-NEXT:    v_mov_b32_e32 v60, 0x7ff80000
 ; GFX950-NEXT:    s_waitcnt vmcnt(23)
-; GFX950-NEXT:    v_min_f64 v[56:57], v[8:9], v[46:47]
-; GFX950-NEXT:    v_cndmask_b32_e64 v58, v58, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v59, v59, v2, vcc
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[46:47]
-; GFX950-NEXT:    v_cndmask_b32_e64 v6, v0, 0, s[4:5]
-; GFX950-NEXT:    v_cndmask_b32_e64 v7, v1, v2, s[4:5]
-; GFX950-NEXT:    v_cndmask_b32_e64 v8, v56, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v57, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(21)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[10:11], v[44:45]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[44:45]
-; GFX950-NEXT:    v_cndmask_b32_e64 v60, v60, 0, s[0:1]
-; GFX950-NEXT:    v_cndmask_b32_e64 v3, v61, v2, s[0:1]
-; GFX950-NEXT:    v_cndmask_b32_e64 v10, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v1, v2, vcc
+; GFX950-NEXT:    v_min_f64 v[46:47], v[28:29], v[40:41]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[28:29], v[40:41]
+; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:64
+; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:60
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_min_f64 v[58:59], v[6:7], v[56:57]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[0:1], v[6:7], v[56:57]
+; GFX950-NEXT:    scratch_load_dword v7, off, s32 offset:56
+; GFX950-NEXT:    scratch_load_dword v6, off, s32 offset:52
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_min_f64 v[56:57], v[4:5], v[44:45]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[2:3], v[4:5], v[44:45]
+; GFX950-NEXT:    scratch_load_dword v5, off, s32 offset:48
+; GFX950-NEXT:    scratch_load_dword v4, off, s32 offset:44
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_min_f64 v[44:45], v[2:3], v[42:43]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[42:43]
+; GFX950-NEXT:    scratch_load_dword v3, off, s32 offset:40
+; GFX950-NEXT:    scratch_load_dword v2, off, s32 offset:36
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_min_f64 v[42:43], v[0:1], v[52:53]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[6:7], v[0:1], v[52:53]
 ; GFX950-NEXT:    s_waitcnt vmcnt(19)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[12:13], v[42:43]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[42:43]
-; GFX950-NEXT:    v_cndmask_b32_e64 v4, v62, 0, s[2:3]
-; GFX950-NEXT:    v_cndmask_b32_e64 v5, v63, v2, s[2:3]
-; GFX950-NEXT:    v_cndmask_b32_e64 v12, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(17)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[14:15], v[40:41]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[40:41]
-; GFX950-NEXT:    v_accvgpr_read_b32 v63, a15 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v62, a14 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v14, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(15)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[16:17], v[54:55]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[16:17], v[54:55]
-; GFX950-NEXT:    v_accvgpr_read_b32 v61, a13 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v57, a9 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v16, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v17, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(13)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[18:19], v[52:53]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[18:19], v[52:53]
-; GFX950-NEXT:    v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX950-NEXT:    v_min_f64 v[0:1], v[30:31], v[54:55]
+; GFX950-NEXT:    s_waitcnt vmcnt(18)
+; GFX950-NEXT:    v_min_f64 v[52:53], v[26:27], v[48:49]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[8:9], v[30:31], v[54:55]
+; GFX950-NEXT:    v_cndmask_b32_e32 v29, v47, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v28, v46, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[26:27], v[48:49]
+; GFX950-NEXT:    v_cndmask_b32_e64 v31, v1, v60, s[8:9]
+; GFX950-NEXT:    v_cndmask_b32_e64 v30, v0, 0, s[8:9]
+; GFX950-NEXT:    v_cndmask_b32_e32 v27, v53, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v26, v52, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(16)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[24:25], v[50:51]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[24:25], v[50:51]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v47, a7 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v18, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v19, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(11)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[20:21], v[50:51]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[20:21], v[50:51]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v46, a6 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v45, a5 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v20, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v21, v1, v2, vcc
-; GFX950-NEXT:    s_waitcnt vmcnt(9)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[22:23], v[48:49]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[22:23], v[48:49]
-; GFX950-NEXT:    v_accvgpr_read_b32 v44, a4 ; Reload Reuse
-; GFX950-NEXT:    v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e32 v25, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v24, v0, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(14)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[22:23], v[38:39]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[22:23], v[38:39]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v1, v60, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v22, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v23, v1, v2, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(12)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[20:21], v[36:37]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[20:21], v[36:37]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v20, v0, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(10)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[18:19], v[34:35]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[18:19], v[34:35]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v18, v0, 0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(8)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[16:17], v[32:33]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[16:17], v[32:33]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v16, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(6)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[24:25], v[34:35]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[24:25], v[34:35]
-; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX950-NEXT:    v_min_f64 v[0:1], v[14:15], v[40:41]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[40:41]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v41, a1 ; Reload Reuse
-; GFX950-NEXT:    v_cndmask_b32_e64 v24, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v25, v1, v2, vcc
 ; GFX950-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v14, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(4)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[26:27], v[32:33]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[26:27], v[32:33]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v26, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v27, v1, v2, vcc
+; GFX950-NEXT:    v_min_f64 v[0:1], v[12:13], v[6:7]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[6:7]
+; GFX950-NEXT:    v_cndmask_b32_e64 v7, v59, v60, s[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v58, 0, s[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v12, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(2)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[28:29], v[36:37]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[28:29], v[36:37]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v28, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v29, v1, v2, vcc
+; GFX950-NEXT:    v_min_f64 v[0:1], v[10:11], v[4:5]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[4:5]
+; GFX950-NEXT:    v_cndmask_b32_e64 v5, v57, v60, s[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v56, 0, s[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v10, v0, 0, vcc
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_min_f64 v[0:1], v[30:31], v[38:39]
-; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[30:31], v[38:39]
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v30, v0, 0, vcc
-; GFX950-NEXT:    v_cndmask_b32_e32 v31, v1, v2, vcc
-; GFX950-NEXT:    v_mov_b32_e32 v0, v58
-; GFX950-NEXT:    v_mov_b32_e32 v1, v59
-; GFX950-NEXT:    v_mov_b32_e32 v2, v60
+; GFX950-NEXT:    v_min_f64 v[0:1], v[8:9], v[2:3]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e64 v3, v45, v60, s[4:5]
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v44, 0, s[4:5]
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v1, v60, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v1, v43, v60, s[6:7]
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v42, 0, s[6:7]
 ; GFX950-NEXT:    v_accvgpr_read_b32 v60, a12 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_read_b32 v59, a11 ; Reload Reuse
 ; GFX950-NEXT:    v_accvgpr_read_b32 v58, a10 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v57, a9 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v45, a5 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v44, a4 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v16f64:
diff --git a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
index 3b855a56a5abb..beff113dc0661 100644
--- a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
+++ b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
@@ -7,11 +7,11 @@ define <2 x i32> @uniform_masked_load_ptr1_mask_v2i32(ptr addrspace(1) inreg noc
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1]
 ; GFX942-NEXT:  .LBB0_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
@@ -30,13 +30,12 @@ define <4 x i32> @uniform_masked_load_ptr1_mask_v4i32(ptr addrspace(1) inreg noc
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
 ; GFX942-NEXT:  .LBB1_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
@@ -55,13 +54,12 @@ define <4 x float> @uniform_masked_load_ptr1_mask_v4f32(ptr addrspace(1) inreg n
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
 ; GFX942-NEXT:  .LBB2_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
@@ -80,20 +78,16 @@ define <8 x i32> @uniform_masked_load_ptr1_mask_v8i32(ptr addrspace(1) inreg noc
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
-; GFX942-NEXT:    global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
 ; GFX942-NEXT:  .LBB3_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
@@ -111,20 +105,16 @@ define <8 x float> @uniform_masked_load_ptr1_mask_v8f32(ptr addrspace(1) inreg n
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
-; GFX942-NEXT:    global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
 ; GFX942-NEXT:  .LBB4_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
@@ -142,13 +132,12 @@ define <8 x i16> @uniform_masked_load_ptr1_mask_v8i16(ptr addrspace(1) inreg noc
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
 ; GFX942-NEXT:  .LBB5_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
@@ -167,13 +156,12 @@ define <8 x half> @uniform_masked_load_ptr1_mask_v8f16(ptr addrspace(1) inreg no
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
 ; GFX942-NEXT:  .LBB6_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
@@ -192,13 +180,12 @@ define <8 x bfloat> @uniform_masked_load_ptr1_mask_v8bf16(ptr addrspace(1) inreg
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX942-NEXT:  ; %bb.1: ; %cond.load
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
 ; GFX942-NEXT:  .LBB7_2:
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index 4f73e8e9c1883..64c602f81cb23 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -5,11 +5,11 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
 
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900,GFX9-SDAG,GFX900-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900,GFX9-GISEL,GFX900-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX900,GFX900-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX900,GFX900-GISEL %s
 
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950,GFX9-SDAG,GFX950-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950,GFX9-GISEL,GFX950-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
 
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
@@ -62,21 +62,37 @@ define half @v_maximumnum_f16(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f16:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f16:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f16:
 ; GFX10-SDAG:       ; %bb.0:
@@ -211,11 +227,17 @@ define half @v_maximumnum_f16_nnan(half %x, half %y) {
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f16_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f16_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -283,12 +305,19 @@ define half @v_maximumnum_f16_1.0(half %x) {
 ; GFX8-NEXT:    v_max_f16_e32 v0, 1.0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f16_1.0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f16_e32 v0, 1.0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f16_1.0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-NEXT:    v_max_f16_e32 v0, 1.0, v0
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f16_1.0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-NEXT:    v_max_f16_e32 v0, 1.0, v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f16_1.0:
 ; GFX10:       ; %bb.0:
@@ -373,21 +402,37 @@ define float @v_maximumnum_f32(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -461,11 +506,17 @@ define float @v_maximumnum_f32_nnan(float %x, float %y) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -525,21 +576,37 @@ define double @v_maximumnum_f64(double %x, double %y) {
 ; GFX8-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f64:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f64:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f64:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX900-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX900-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f64:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX900-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX900-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f64:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f64:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f64:
 ; GFX10-SDAG:       ; %bb.0:
@@ -617,11 +684,17 @@ define double @v_maximumnum_f64_nnan(double %x, double %y) {
 ; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f64_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -663,12 +736,19 @@ define float @v_maximumnum_f32_1.0(float %x) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, 1.0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f32_1.0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v0, 1.0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f32_1.0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_max_f32_e32 v0, 1.0, v0
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f32_1.0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_max_f32_e32 v0, 1.0, v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f32_1.0:
 ; GFX10:       ; %bb.0:
@@ -717,13 +797,21 @@ define float @v_maximumnum_f32_rhs_not_snan(float %x, float %y) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f32_rhs_not_snan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f32_rhs_not_snan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f32_rhs_not_snan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f32_rhs_not_snan:
 ; GFX10:       ; %bb.0:
@@ -774,13 +862,21 @@ define float @v_maximumnum_f32_lhs_not_snan(float %x, float %y) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f32_lhs_not_snan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f32_lhs_not_snan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f32_lhs_not_snan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f32_lhs_not_snan:
 ; GFX10:       ; %bb.0:
@@ -831,13 +927,21 @@ define float @v_maximumnum_f32_both_operands_not_snan(float %x, float %y) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f32_both_operands_not_snan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f32_both_operands_not_snan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f32_both_operands_not_snan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f32_both_operands_not_snan:
 ; GFX10:       ; %bb.0:
@@ -887,12 +991,19 @@ define double @v_maximumnum_f64_1.0(double %x) {
 ; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], 1.0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f64_1.0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], 1.0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f64_1.0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], 1.0
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f64_1.0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], 1.0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f64_1.0:
 ; GFX10:       ; %bb.0:
@@ -2190,21 +2301,37 @@ define float @v_maximumnum_f32_fabs_rhs(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f32_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f32_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f32_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f32_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f32_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f32_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f32_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2303,21 +2430,37 @@ define float @v_maximumnum_f32_fneg_fabs_rhs(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f32_fneg_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f32_fneg_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f32_fneg_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f32_fneg_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f32_fneg_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f32_fneg_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f32_fneg_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2417,21 +2560,37 @@ define float @v_maximumnum_f32_fabs(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f32_fabs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f32_fabs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f32_fabs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f32_fabs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f32_fabs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f32_fabs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f32_fabs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2531,21 +2690,37 @@ define float @v_maximumnum_f32_fneg(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f32_fneg:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, -v1, -v1
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v0, -v0, -v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f32_fneg:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f32_fneg:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f32_fneg:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f32_fneg:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f32_fneg:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f32_fneg:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2648,21 +2823,37 @@ define half @v_maximumnum_f16_fabs_rhs(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f16_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f16_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f16_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f16_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f16_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f16_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f16_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2808,21 +2999,37 @@ define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2969,21 +3176,37 @@ define half @v_maximumnum_f16_fabs(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f16_fabs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f16_fabs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f16_fabs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f16_fabs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f16_fabs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f16_fabs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f16_fabs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -3130,21 +3353,37 @@ define half @v_maximumnum_f16_fneg(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f16_fneg:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, -v1, -v1
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v0, -v0, -v0
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f16_fneg:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f16_fneg:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f16_fneg:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f16_fneg:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f16_fneg:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f16_fneg:
 ; GFX10-SDAG:       ; %bb.0:
@@ -3288,21 +3527,37 @@ define double @v_maximumnum_f64_fneg(double %x, double %y) {
 ; GFX8-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f64_fneg:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f64_fneg:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
-; GFX9-GISEL-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_f64_fneg:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX900-SDAG-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX900-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_f64_fneg:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX900-GISEL-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX900-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_f64_fneg:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_f64_fneg:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_f64_fneg:
 ; GFX10-SDAG:       ; %bb.0:
@@ -3564,11 +3819,17 @@ define <2 x half> @v_maximumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v2f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v2f16_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v2f16_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v2f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -3663,16 +3924,16 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_v3f16:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_v3f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_maximumnum_v3f16:
 ; GFX900-GISEL:       ; %bb.0:
@@ -3685,6 +3946,17 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v2
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-SDAG-LABEL: v_maximumnum_v3f16:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX950-GISEL-LABEL: v_maximumnum_v3f16:
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3834,19 +4106,33 @@ define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_v3f16_nnan:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_v3f16_nnan:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_v3f16_nnan:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v3f16_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_maximumnum_v3f16_nnan:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-GISEL-LABEL: v_maximumnum_v3f16_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v3f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -4157,12 +4443,19 @@ define <4 x half> @v_maximumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v4f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v4f16_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v4f16_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v4f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -6691,27 +6984,49 @@ define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_v2f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_v2f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_v2f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_v2f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v2f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_v2f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v2f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6797,12 +7112,19 @@ define <2 x float> @v_maximumnum_v2f32_nnan(<2 x float> %x, <2 x float> %y) {
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v2f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v2f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v2f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v2f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -6887,33 +7209,61 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_v3f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_v3f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_v3f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_v3f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v3f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_v3f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v3f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -7015,13 +7365,21 @@ define <3 x float> @v_maximumnum_v3f32_nnan(<3 x float> %x, <3 x float> %y) {
 ; GFX8-NEXT:    v_max_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v3f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v5
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v3f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX900-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v3f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX950-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v3f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -7121,39 +7479,73 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_v4f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v6, v6
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v7, v7
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_v4f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_v4f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_v4f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v4f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_v4f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v4f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -7267,14 +7659,23 @@ define <4 x float> @v_maximumnum_v4f32_nnan(<4 x float> %x, <4 x float> %y) {
 ; GFX8-NEXT:    v_max_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v4f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v4f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX900-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX900-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v4f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX950-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX950-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v4f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -7376,12 +7777,12 @@ define <2 x double> @v_maximumnum_v2f64(<2 x double> %x, <2 x double> %y) {
 ; GFX950-SDAG-LABEL: v_maximumnum_v2f64:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: v_maximumnum_v2f64:
@@ -7491,12 +7892,26 @@ define <2 x double> @v_maximumnum_v2f64_nnan(<2 x double> %x, <2 x double> %y) {
 ; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v2f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v2f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v2f64_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_v2f64_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v2f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -7614,15 +8029,15 @@ define <3 x double> @v_maximumnum_v3f64(<3 x double> %x, <3 x double> %y) {
 ; GFX950-SDAG-LABEL: v_maximumnum_v3f64:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
-; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[8:9], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[10:11], v[10:11]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: v_maximumnum_v3f64:
@@ -7755,13 +8170,29 @@ define <3 x double> @v_maximumnum_v3f64_nnan(<3 x double> %x, <3 x double> %y) {
 ; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v3f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v3f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v3f64_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_v3f64_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v3f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -7900,18 +8331,18 @@ define <4 x double> @v_maximumnum_v4f64(<4 x double> %x, <4 x double> %y) {
 ; GFX950-SDAG-LABEL: v_maximumnum_v4f64:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[14:15], v[14:15], v[14:15]
+; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX950-SDAG-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-SDAG-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[12:13], v[12:13]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[14:15], v[14:15]
-; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
-; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[8:9]
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: v_maximumnum_v4f64:
@@ -8067,14 +8498,32 @@ define <4 x double> @v_maximumnum_v4f64_nnan(<4 x double> %x, <4 x double> %y) {
 ; GFX8-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v4f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
-; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
-; GFX9-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v4f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
+; GFX900-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX900-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v4f64_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_maximumnum_v4f64_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v4f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -8136,11 +8585,17 @@ define half @v_maximumnum_f16_no_ieee(half %x, half %y) #0 {
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f16_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f16_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f16_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f16_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8241,11 +8696,17 @@ define half @v_maximumnum_f16_nan_no_ieee(half %x, half %y) #0 {
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f16_nan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f16_nan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f16_nan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f16_nan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8301,11 +8762,17 @@ define float @v_maximumnum_f32_no_ieee(float %x, float %y) #0 {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f32_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f32_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f32_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f32_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8359,11 +8826,17 @@ define float @v_maximumnum_f32_nnan_no_ieee(float %x, float %y) #0 {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f32_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f32_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f32_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f32_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8403,11 +8876,17 @@ define double @v_maximumnum_f64_no_ieee(double %x, double %y) #0 {
 ; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f64_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f64_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f64_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f64_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8463,11 +8942,17 @@ define double @v_maximumnum_f64_nnan_no_ieee(double %x, double %y) #0 {
 ; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_f64_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_f64_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_f64_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_f64_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8539,11 +9024,17 @@ define <2 x half> @v_maximumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v2f16_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v2f16_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v2f16_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8631,11 +9122,17 @@ define <2 x half> @v_maximumnum_v2f16_nnan_no_ieee(<2 x half> %x, <2 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v2f16_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v2f16_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v2f16_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v2f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8718,19 +9215,33 @@ define <3 x half> @v_maximumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-GISEL-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8829,12 +9340,19 @@ define <4 x half> @v_maximumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v4f16_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximumnum_v4f16_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v4f16_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v4f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8865,6 +9383,3 @@ define <4 x half> @v_maximumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y)
 }
 
 attributes #0 = { "amdgpu-ieee"="false" }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX900: {{.*}}
-; GFX950: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 6110b3101020a..d8454c900d23a 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -2498,39 +2498,40 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
 ;
 ; GFX942-LABEL: test_mfma_nested_loop_zeroinit:
 ; GFX942:       ; %bb.0: ; %entry
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v1
 ; GFX942-NEXT:    s_mov_b32 s0, 0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a7, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a8, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a9, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a10, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a11, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a12, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a13, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a14, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a15, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a16, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a17, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a18, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a19, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a20, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a21, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a22, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a23, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a24, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a25, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a26, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a27, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a28, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a29, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a30, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a31, a0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:  .LBB9_1: ; %for.cond.preheader
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 558006d2b6957..9c4a1ca797110 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -5,11 +5,11 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
 
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900,GFX9-SDAG,GFX900-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900,GFX9-GISEL,GFX900-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX900,GFX900-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX900,GFX900-GISEL %s
 
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950,GFX9-SDAG,GFX950-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950,GFX9-GISEL,GFX950-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
 
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
@@ -62,21 +62,37 @@ define half @v_minimumnum_f16(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f16:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f16:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f16:
 ; GFX10-SDAG:       ; %bb.0:
@@ -211,11 +227,17 @@ define half @v_minimumnum_f16_nnan(half %x, half %y) {
 ; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f16_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f16_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -283,12 +305,19 @@ define half @v_minimumnum_f16_1.0(half %x) {
 ; GFX8-NEXT:    v_min_f16_e32 v0, 1.0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f16_1.0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT:    v_min_f16_e32 v0, 1.0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f16_1.0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-NEXT:    v_min_f16_e32 v0, 1.0, v0
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f16_1.0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-NEXT:    v_min_f16_e32 v0, 1.0, v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f16_1.0:
 ; GFX10:       ; %bb.0:
@@ -373,21 +402,37 @@ define float @v_minimumnum_f32(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -461,11 +506,17 @@ define float @v_minimumnum_f32_nnan(float %x, float %y) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -525,21 +576,37 @@ define double @v_minimumnum_f64(double %x, double %y) {
 ; GFX8-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f64:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f64:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f64:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX900-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX900-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f64:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX900-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX900-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f64:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f64:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f64:
 ; GFX10-SDAG:       ; %bb.0:
@@ -617,11 +684,17 @@ define double @v_minimumnum_f64_nnan(double %x, double %y) {
 ; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f64_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -663,12 +736,19 @@ define float @v_minimumnum_f32_1.0(float %x) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, 1.0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f32_1.0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_min_f32_e32 v0, 1.0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f32_1.0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_min_f32_e32 v0, 1.0, v0
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f32_1.0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_min_f32_e32 v0, 1.0, v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f32_1.0:
 ; GFX10:       ; %bb.0:
@@ -717,13 +797,21 @@ define float @v_minimumnum_f32_rhs_not_snan(float %x, float %y) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f32_rhs_not_snan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f32_rhs_not_snan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f32_rhs_not_snan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f32_rhs_not_snan:
 ; GFX10:       ; %bb.0:
@@ -774,13 +862,21 @@ define float @v_minimumnum_f32_lhs_not_snan(float %x, float %y) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f32_lhs_not_snan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f32_lhs_not_snan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f32_lhs_not_snan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f32_lhs_not_snan:
 ; GFX10:       ; %bb.0:
@@ -831,13 +927,21 @@ define float @v_minimumnum_f32_both_operands_not_snan(float %x, float %y) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f32_both_operands_not_snan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f32_both_operands_not_snan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f32_both_operands_not_snan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f32_both_operands_not_snan:
 ; GFX10:       ; %bb.0:
@@ -887,12 +991,19 @@ define double @v_minimumnum_f64_1.0(double %x) {
 ; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], 1.0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f64_1.0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], 1.0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f64_1.0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], 1.0
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f64_1.0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], 1.0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f64_1.0:
 ; GFX10:       ; %bb.0:
@@ -2015,21 +2126,37 @@ define float @v_minimumnum_f32_fabs_rhs(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f32_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f32_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f32_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f32_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f32_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f32_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f32_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2128,21 +2255,37 @@ define float @v_minimumnum_f32_fneg_fabs_rhs(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f32_fneg_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f32_fneg_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f32_fneg_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f32_fneg_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f32_fneg_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f32_fneg_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f32_fneg_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2242,21 +2385,37 @@ define float @v_minimumnum_f32_fabs(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f32_fabs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f32_fabs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f32_fabs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f32_fabs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f32_fabs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f32_fabs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v0, |v0|, |v0|
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f32_fabs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2356,21 +2515,37 @@ define float @v_minimumnum_f32_fneg(float %x, float %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f32_fneg:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v1, -v1, -v1
-; GFX9-SDAG-NEXT:    v_max_f32_e64 v0, -v0, -v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f32_fneg:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
-; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f32_fneg:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX900-SDAG-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f32_fneg:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX900-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f32_fneg:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX950-SDAG-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f32_fneg:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
+; GFX950-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f32_fneg:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2473,21 +2648,37 @@ define half @v_minimumnum_f16_fabs_rhs(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f16_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f16_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f16_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f16_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f16_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f16_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f16_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2633,21 +2824,37 @@ define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX9-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX900-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX900-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX950-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX950-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2794,21 +3001,37 @@ define half @v_minimumnum_f16_fabs(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f16_fabs:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
-; GFX9-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f16_fabs:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f16_fabs:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX900-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f16_fabs:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX900-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f16_fabs:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX950-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f16_fabs:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX950-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f16_fabs:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2955,21 +3178,37 @@ define half @v_minimumnum_f16_fneg(half %x, half %y) {
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f16_fneg:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v1, -v1, -v1
-; GFX9-SDAG-NEXT:    v_max_f16_e64 v0, -v0, -v0
-; GFX9-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f16_fneg:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
-; GFX9-GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
-; GFX9-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f16_fneg:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX900-SDAG-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX900-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f16_fneg:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX900-GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX900-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f16_fneg:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX950-SDAG-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX950-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f16_fneg:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX950-GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX950-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f16_fneg:
 ; GFX10-SDAG:       ; %bb.0:
@@ -3113,21 +3352,37 @@ define double @v_minimumnum_f64_fneg(double %x, double %y) {
 ; GFX8-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f64_fneg:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
-; GFX9-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f64_fneg:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
-; GFX9-GISEL-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
-; GFX9-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_f64_fneg:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX900-SDAG-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX900-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_f64_fneg:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX900-GISEL-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX900-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_f64_fneg:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_f64_fneg:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_f64_fneg:
 ; GFX10-SDAG:       ; %bb.0:
@@ -3389,11 +3644,17 @@ define <2 x half> @v_minimumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v2f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v2f16_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v2f16_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v2f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -3488,16 +3749,16 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_v3f16:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_v3f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_minimumnum_v3f16:
 ; GFX900-GISEL:       ; %bb.0:
@@ -3510,6 +3771,17 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v2
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-SDAG-LABEL: v_minimumnum_v3f16:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX950-GISEL-LABEL: v_minimumnum_v3f16:
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3659,19 +3931,33 @@ define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_v3f16_nnan:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_v3f16_nnan:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_v3f16_nnan:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v3f16_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_minimumnum_v3f16_nnan:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-GISEL-LABEL: v_minimumnum_v3f16_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v3f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -3982,12 +4268,19 @@ define <4 x half> @v_minimumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v4f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v4f16_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v4f16_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v4f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -6516,27 +6809,49 @@ define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_v2f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_v2f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_v2f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_v2f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v2f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_v2f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v2f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6622,12 +6937,19 @@ define <2 x float> @v_minimumnum_v2f32_nnan(<2 x float> %x, <2 x float> %y) {
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v2f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v2f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX900-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v2f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX950-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v2f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -6712,33 +7034,61 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_v3f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v2, v2, v3
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_v3f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_v3f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_v3f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v3f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_v3f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v3f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6840,13 +7190,21 @@ define <3 x float> @v_minimumnum_v3f32_nnan(<3 x float> %x, <3 x float> %y) {
 ; GFX8-NEXT:    v_min_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v3f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v5
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v3f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX900-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX900-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v3f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX950-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX950-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v3f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -6946,39 +7304,73 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_v4f32:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v6, v6
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v4, v7, v7
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_v4f32:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_v4f32:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX900-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-SDAG-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_v4f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX900-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v4f32:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX950-SDAG-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-SDAG-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_v4f32:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX950-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v4f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -7092,14 +7484,23 @@ define <4 x float> @v_minimumnum_v4f32_nnan(<4 x float> %x, <4 x float> %y) {
 ; GFX8-NEXT:    v_min_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v4f32_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v4f32_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX900-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX900-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX900-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v4f32_nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX950-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX950-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX950-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v4f32_nnan:
 ; GFX10:       ; %bb.0:
@@ -7201,12 +7602,12 @@ define <2 x double> @v_minimumnum_v2f64(<2 x double> %x, <2 x double> %y) {
 ; GFX950-SDAG-LABEL: v_minimumnum_v2f64:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: v_minimumnum_v2f64:
@@ -7316,12 +7717,26 @@ define <2 x double> @v_minimumnum_v2f64_nnan(<2 x double> %x, <2 x double> %y) {
 ; GFX8-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v2f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v2f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v2f64_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_v2f64_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v2f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -7439,15 +7854,15 @@ define <3 x double> @v_minimumnum_v3f64(<3 x double> %x, <3 x double> %y) {
 ; GFX950-SDAG-LABEL: v_minimumnum_v3f64:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
 ; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
-; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[8:9], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[10:11], v[10:11]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX950-SDAG-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: v_minimumnum_v3f64:
@@ -7580,13 +7995,29 @@ define <3 x double> @v_minimumnum_v3f64_nnan(<3 x double> %x, <3 x double> %y) {
 ; GFX8-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v3f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v3f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v3f64_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_v3f64_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v3f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -7725,18 +8156,18 @@ define <4 x double> @v_minimumnum_v4f64(<4 x double> %x, <4 x double> %y) {
 ; GFX950-SDAG-LABEL: v_minimumnum_v4f64:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_max_f64 v[14:15], v[14:15], v[14:15]
+; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX950-SDAG-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-SDAG-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
 ; GFX950-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-SDAG-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
 ; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
-; GFX950-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[12:13], v[12:13]
-; GFX950-SDAG-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX950-SDAG-NEXT:    v_min_f64 v[4:5], v[4:5], v[8:9]
-; GFX950-SDAG-NEXT:    v_max_f64 v[8:9], v[14:15], v[14:15]
-; GFX950-SDAG-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
-; GFX950-SDAG-NEXT:    v_min_f64 v[6:7], v[6:7], v[8:9]
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: v_minimumnum_v4f64:
@@ -7892,14 +8323,32 @@ define <4 x double> @v_minimumnum_v4f64_nnan(<4 x double> %x, <4 x double> %y) {
 ; GFX8-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v4f64_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
-; GFX9-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
-; GFX9-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v4f64_nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
+; GFX900-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX900-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v4f64_nnan:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-SDAG-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: v_minimumnum_v4f64_nnan:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
+; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
+; GFX950-GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
+; GFX950-GISEL-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v4f64_nnan:
 ; GFX10:       ; %bb.0:
@@ -7961,11 +8410,17 @@ define half @v_minimumnum_f16_no_ieee(half %x, half %y) #0 {
 ; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f16_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f16_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f16_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f16_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8066,11 +8521,17 @@ define half @v_minimumnum_f16_nan_no_ieee(half %x, half %y) #0 {
 ; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f16_nan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f16_nan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f16_nan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f16_nan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8126,11 +8587,17 @@ define float @v_minimumnum_f32_no_ieee(float %x, float %y) #0 {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f32_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f32_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f32_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f32_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8184,11 +8651,17 @@ define float @v_minimumnum_f32_nnan_no_ieee(float %x, float %y) #0 {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f32_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f32_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f32_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f32_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8228,11 +8701,17 @@ define double @v_minimumnum_f64_no_ieee(double %x, double %y) #0 {
 ; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f64_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f64_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f64_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f64_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8288,11 +8767,17 @@ define double @v_minimumnum_f64_nnan_no_ieee(double %x, double %y) #0 {
 ; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_f64_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_f64_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_f64_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_f64_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8364,11 +8849,17 @@ define <2 x half> @v_minimumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v2f16_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v2f16_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v2f16_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8456,11 +8947,17 @@ define <2 x half> @v_minimumnum_v2f16_nnan_no_ieee(<2 x half> %x, <2 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v2f16_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v2f16_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v2f16_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v2f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8543,19 +9040,33 @@ define <3 x half> @v_minimumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-SDAG-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-GISEL-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8654,12 +9165,19 @@ define <4 x half> @v_minimumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v4f16_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimumnum_v4f16_nnan_no_ieee:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v4f16_nnan_no_ieee:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v4f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
@@ -8690,6 +9208,3 @@ define <4 x half> @v_minimumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y)
 }
 
 attributes #0 = { "amdgpu-ieee"="false" }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX900: {{.*}}
-; GFX950: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 7e3d5c97391e1..57e1b87a01063 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -3232,29 +3232,28 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
 ; GFX1250-NEXT:    s_mov_b32 s2, s8
 ; GFX1250-NEXT:    s_and_b64 s[4:5], s[12:13], s[4:5]
 ; GFX1250-NEXT:    s_mov_b32 s6, s13
-; GFX1250-NEXT:    s_mul_u64 s[10:11], s[10:11], s[12:13]
-; GFX1250-NEXT:    s_mul_u64 s[12:13], s[4:5], s[2:3]
+; GFX1250-NEXT:    s_mul_u64 s[22:23], s[4:5], s[2:3]
+; GFX1250-NEXT:    s_mul_u64 s[24:25], s[6:7], s[2:3]
+; GFX1250-NEXT:    s_mov_b32 s2, s23
 ; GFX1250-NEXT:    s_mov_b32 s16, s9
 ; GFX1250-NEXT:    s_mul_u64 s[8:9], s[8:9], s[14:15]
-; GFX1250-NEXT:    s_mul_u64 s[14:15], s[6:7], s[2:3]
-; GFX1250-NEXT:    s_mov_b32 s2, s13
+; GFX1250-NEXT:    s_add_nc_u64 s[14:15], s[24:25], s[2:3]
 ; GFX1250-NEXT:    s_mul_u64 s[4:5], s[4:5], s[16:17]
-; GFX1250-NEXT:    s_add_nc_u64 s[14:15], s[14:15], s[2:3]
-; GFX1250-NEXT:    s_mul_u64 s[6:7], s[6:7], s[16:17]
 ; GFX1250-NEXT:    s_mov_b32 s2, s15
 ; GFX1250-NEXT:    s_mov_b32 s15, s3
-; GFX1250-NEXT:    s_mov_b32 s13, s3
+; GFX1250-NEXT:    s_mul_u64 s[10:11], s[10:11], s[12:13]
 ; GFX1250-NEXT:    s_add_nc_u64 s[4:5], s[4:5], s[14:15]
-; GFX1250-NEXT:    s_add_nc_u64 s[8:9], s[10:11], s[8:9]
+; GFX1250-NEXT:    s_mul_u64 s[6:7], s[6:7], s[16:17]
 ; GFX1250-NEXT:    s_mov_b32 s18, s5
-; GFX1250-NEXT:    s_mov_b32 s21, s4
+; GFX1250-NEXT:    s_mov_b32 s23, s3
 ; GFX1250-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[18:19]
-; GFX1250-NEXT:    s_or_b64 s[4:5], s[12:13], s[20:21]
+; GFX1250-NEXT:    s_mov_b32 s21, s4
 ; GFX1250-NEXT:    s_add_nc_u64 s[2:3], s[6:7], s[2:3]
-; GFX1250-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1250-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[8:9]
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1250-NEXT:    s_add_nc_u64 s[4:5], s[10:11], s[8:9]
+; GFX1250-NEXT:    s_or_b64 s[6:7], s[22:23], s[20:21]
+; GFX1250-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
index cf244f0b1f884..cba06c17b51ef 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -7,34 +7,32 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    s_mov_b32 s4, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a1, 0
-; GFX942-NEXT:    s_mov_b32 s3, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
 ; GFX942-NEXT:    s_branch .LBB0_2
 ; GFX942-NEXT:  .LBB0_1: ; %bb2
 ; GFX942-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GFX942-NEXT:    s_or_b32 s4, s3, 1
-; GFX942-NEXT:    s_ashr_i32 s5, s3, 31
-; GFX942-NEXT:    s_mov_b32 s3, s2
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT:    s_nop 2
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a0, a2
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a1
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a1
-; GFX942-NEXT:    s_and_b32 s3, s5, s4
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3]
+; GFX942-NEXT:    s_or_b32 s2, s4, 1
+; GFX942-NEXT:    s_ashr_i32 s3, s4, 31
+; GFX942-NEXT:    v_mfma_f32_16x16x16_f16 a[2:5], v[0:1], v[0:1], a[0:3]
+; GFX942-NEXT:    s_and_b32 s4, s3, s2
 ; GFX942-NEXT:    s_cbranch_execz .LBB0_4
 ; GFX942-NEXT:  .LBB0_2: ; %bb
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_vccz .LBB0_1
 ; GFX942-NEXT:  ; %bb.3:
-; GFX942-NEXT:    ; implicit-def: $sgpr3
+; GFX942-NEXT:    ; implicit-def: $sgpr4
 ; GFX942-NEXT:    ; implicit-def: $agpr2
 ; GFX942-NEXT:  .LBB0_4: ; %common.ret
 ; GFX942-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 5b0d2d22c3e3d..afc8a9e4149ea 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -233,52 +233,52 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ; GFX900-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
 ; GFX900-NEXT:    s_endpgm
 ;
-; PACKED-SDAG-LABEL: fadd_v32_vs:
-; PACKED-SDAG:       ; %bb.0:
-; PACKED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; PACKED-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; PACKED-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
-; PACKED-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[40:41]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[42:43]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(6)
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[6:7], v[6:7], s[38:39]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(5)
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[8:9], v[8:9], s[48:49]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[10:11], v[10:11], s[50:51]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(4)
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[16:17], v[16:17], s[44:45]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[18:19], v[18:19], s[46:47]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[28:29], v[28:29], s[16:17]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[30:31], v[30:31], s[18:19]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[20:21], v[20:21], s[12:13]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[22:23], v[22:23], s[14:15]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[14:15], v[14:15], s[10:11]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[24:25], v[24:25], s[20:21]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[26:27], v[26:27], s[22:23]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[4:5], v[4:5], s[36:37]
-; PACKED-SDAG-NEXT:    v_pk_add_f32 v[12:13], v[12:13], s[8:9]
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
-; PACKED-SDAG-NEXT:    s_endpgm
+; GFX90A-SDAG-LABEL: fadd_v32_vs:
+; GFX90A-SDAG:       ; %bb.0:
+; GFX90A-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; GFX90A-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; GFX90A-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX90A-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[40:41]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[42:43]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[6:7], v[6:7], s[38:39]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[8:9], v[8:9], s[48:49]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[10:11], v[10:11], s[50:51]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[16:17], v[16:17], s[44:45]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[18:19], v[18:19], s[46:47]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[28:29], v[28:29], s[16:17]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[30:31], v[30:31], s[18:19]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[20:21], v[20:21], s[12:13]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[22:23], v[22:23], s[14:15]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[14:15], v[14:15], s[10:11]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[24:25], v[24:25], s[20:21]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[26:27], v[26:27], s[22:23]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[4:5], v[4:5], s[36:37]
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[12:13], v[12:13], s[8:9]
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX90A-SDAG-NEXT:    s_endpgm
 ;
 ; PACKED-GISEL-LABEL: fadd_v32_vs:
 ; PACKED-GISEL:       ; %bb.0:
@@ -330,58 +330,108 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
 ; PACKED-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: fadd_v32_vs:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v36, 7, v0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[0:3], v36, s[0:1] offset:112
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[12:15], v36, s[0:1] offset:96
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[24:27], v36, s[0:1] offset:80
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[28:31], v36, s[0:1] offset:64
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[32:35], v36, s[0:1] offset:48
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[16:19], v36, s[0:1] offset:32
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[8:11], v36, s[0:1] offset:16
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[4:7], v36, s[0:1]
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xe4
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xa4
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[50:51]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[22:23], v[14:15], s[46:47]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[20:21], v[12:13], s[44:45]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[48:49]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[14:15], v[26:27], s[42:43]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[12:13], v[24:25], s[40:41]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[26:27], v[30:31], s[38:39]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[24:25], v[28:29], s[36:37]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[30:31], v[34:35], s[22:23]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[28:29], v[32:33], s[20:21]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[18:19], v[18:19], s[18:19]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[16:17], v[16:17], s[16:17]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[10:11], v[10:11], s[14:15]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[8:9], v[8:9], s[12:13]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[6:7], v[6:7], s[10:11]
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[4:5], v[4:5], s[8:9]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[20:23], s[0:1] offset:96
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] offset:112
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[24:27], s[0:1] offset:64
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[12:15], s[0:1] offset:80
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[28:31], s[0:1] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:16
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1250-SDAG-LABEL: fadd_v32_vs:
 ; GFX1250-SDAG:       ; %bb.0:
-; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-SDAG-NEXT:    v_lshlrev_b32_e32 v36, 7, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_clause 0x7
-; GFX1250-SDAG-NEXT:    global_load_b128 v[0:3], v32, s[0:1] offset:16
-; GFX1250-SDAG-NEXT:    global_load_b128 v[4:7], v32, s[0:1]
-; GFX1250-SDAG-NEXT:    global_load_b128 v[8:11], v32, s[0:1] offset:48
-; GFX1250-SDAG-NEXT:    global_load_b128 v[20:23], v32, s[0:1] offset:32
-; GFX1250-SDAG-NEXT:    global_load_b128 v[12:15], v32, s[0:1] offset:80
-; GFX1250-SDAG-NEXT:    global_load_b128 v[16:19], v32, s[0:1] offset:64
-; GFX1250-SDAG-NEXT:    global_load_b128 v[24:27], v32, s[0:1] offset:112
-; GFX1250-SDAG-NEXT:    global_load_b128 v[28:31], v32, s[0:1] offset:96
-; GFX1250-SDAG-NEXT:    s_clause 0x1
-; GFX1250-SDAG-NEXT:    s_load_b512 s[8:23], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT:    s_load_b512 s[36:51], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT:    global_load_b128 v[0:3], v36, s[34:35] offset:112
+; GFX1250-SDAG-NEXT:    global_load_b128 v[12:15], v36, s[34:35] offset:96
+; GFX1250-SDAG-NEXT:    global_load_b128 v[24:27], v36, s[34:35] offset:80
+; GFX1250-SDAG-NEXT:    global_load_b128 v[28:31], v36, s[34:35] offset:64
+; GFX1250-SDAG-NEXT:    global_load_b128 v[32:35], v36, s[34:35] offset:48
+; GFX1250-SDAG-NEXT:    global_load_b128 v[16:19], v36, s[34:35] offset:32
+; GFX1250-SDAG-NEXT:    global_load_b128 v[8:11], v36, s[34:35] offset:16
+; GFX1250-SDAG-NEXT:    global_load_b128 v[4:7], v36, s[34:35]
+; GFX1250-SDAG-NEXT:    s_load_b512 s[16:31], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT:    s_load_b512 s[0:15], s[4:5], 0xa4
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x7
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[12:13]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[14:15]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[30:31]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[6:7], v[6:7], s[10:11]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[22:23], v[14:15], s[26:27]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[20:21], v[12:13], s[24:25]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[28:29]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x5
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[14:15], v[26:27], s[22:23]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[12:13], v[24:25], s[20:21]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x4
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[20:21], v[20:21], s[16:17]
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x3
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[12:13], v[12:13], s[40:41]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[26:27], v[30:31], s[18:19]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[24:25], v[28:29], s[16:17]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x2
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[18:19], v[18:19], s[38:39]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[18:19], v[18:19], s[10:11]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[16:17], v[16:17], s[8:9]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[30:31], v[34:35], s[14:15]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[28:29], v[32:33], s[12:13]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x1
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[24:25], v[24:25], s[48:49]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[10:11], v[10:11], s[6:7]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[8:9], v[8:9], s[4:5]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[28:29], v[28:29], s[44:45]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[30:31], v[30:31], s[46:47]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[26:27], v[26:27], s[50:51]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[16:17], v[16:17], s[36:37]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[14:15], v[14:15], s[42:43]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[22:23], v[22:23], s[18:19]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[8:9], v[8:9], s[20:21]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[10:11], v[10:11], s[22:23]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[4:5], v[4:5], s[8:9]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[6:7], v[6:7], s[2:3]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[4:5], v[4:5], s[0:1]
 ; GFX1250-SDAG-NEXT:    s_clause 0x7
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:96
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:112
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:64
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:80
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:48
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[4:7], s[0:1]
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:16
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[20:23], s[34:35] offset:96
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[0:3], s[34:35] offset:112
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[24:27], s[34:35] offset:64
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[12:15], s[34:35] offset:80
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[16:19], s[34:35] offset:32
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[28:31], s[34:35] offset:48
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[4:7], s[34:35]
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[8:11], s[34:35] offset:16
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: fadd_v32_vs:
@@ -676,18 +726,43 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
 ; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX900-NEXT:    s_endpgm
 ;
-; PACKED-LABEL: fadd_v2_v_lit_hi0:
-; PACKED:       ; %bb.0:
-; PACKED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; PACKED-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; PACKED-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; PACKED-NEXT:    s_mov_b64 s[2:3], 0x3f800000
-; PACKED-NEXT:    s_waitcnt lgkmcnt(0)
-; PACKED-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
-; PACKED-NEXT:    s_waitcnt vmcnt(0)
-; PACKED-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[2:3]
-; PACKED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; PACKED-NEXT:    s_endpgm
+; GFX90A-SDAG-LABEL: fadd_v2_v_lit_hi0:
+; GFX90A-SDAG:       ; %bb.0:
+; GFX90A-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX90A-SDAG-NEXT:    s_mov_b64 s[2:3], 0x3f800000
+; GFX90A-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-SDAG-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; GFX90A-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90A-SDAG-NEXT:    s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v2_v_lit_hi0:
+; PACKED-GISEL:       ; %bb.0:
+; PACKED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT:    s_mov_b64 s[2:3], 0x3f800000
+; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; PACKED-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT:    s_endpgm
+;
+; GFX942-SDAG-LABEL: fadd_v2_v_lit_hi0:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], 1.0
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-SDAG-LABEL: fadd_v2_v_lit_hi0:
 ; GFX1250-SDAG:       ; %bb.0:
@@ -1417,52 +1492,52 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ; GFX900-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
 ; GFX900-NEXT:    s_endpgm
 ;
-; PACKED-SDAG-LABEL: fmul_v32_vs:
-; PACKED-SDAG:       ; %bb.0:
-; PACKED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; PACKED-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; PACKED-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
-; PACKED-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], s[40:41]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[2:3], v[2:3], s[42:43]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(6)
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[6:7], v[6:7], s[38:39]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(5)
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[8:9], v[8:9], s[48:49]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[10:11], v[10:11], s[50:51]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(4)
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[16:17], v[16:17], s[44:45]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[18:19], v[18:19], s[46:47]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[28:29], v[28:29], s[16:17]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[30:31], v[30:31], s[18:19]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[20:21], v[20:21], s[12:13]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[22:23], v[22:23], s[14:15]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[14:15], v[14:15], s[10:11]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[24:25], v[24:25], s[20:21]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[26:27], v[26:27], s[22:23]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[4:5], v[4:5], s[36:37]
-; PACKED-SDAG-NEXT:    v_pk_mul_f32 v[12:13], v[12:13], s[8:9]
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
-; PACKED-SDAG-NEXT:    s_endpgm
+; GFX90A-SDAG-LABEL: fmul_v32_vs:
+; GFX90A-SDAG:       ; %bb.0:
+; GFX90A-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; GFX90A-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; GFX90A-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX90A-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], s[40:41]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[2:3], v[2:3], s[42:43]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[6:7], v[6:7], s[38:39]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[8:9], v[8:9], s[48:49]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[10:11], v[10:11], s[50:51]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[16:17], v[16:17], s[44:45]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[18:19], v[18:19], s[46:47]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[28:29], v[28:29], s[16:17]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[30:31], v[30:31], s[18:19]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[20:21], v[20:21], s[12:13]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[22:23], v[22:23], s[14:15]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[14:15], v[14:15], s[10:11]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[24:25], v[24:25], s[20:21]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[26:27], v[26:27], s[22:23]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[4:5], v[4:5], s[36:37]
+; GFX90A-SDAG-NEXT:    v_pk_mul_f32 v[12:13], v[12:13], s[8:9]
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX90A-SDAG-NEXT:    s_endpgm
 ;
 ; PACKED-GISEL-LABEL: fmul_v32_vs:
 ; PACKED-GISEL:       ; %bb.0:
@@ -1514,58 +1589,108 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
 ; PACKED-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: fmul_v32_vs:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v36, 7, v0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[0:3], v36, s[0:1] offset:112
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[12:15], v36, s[0:1] offset:96
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[24:27], v36, s[0:1] offset:80
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[28:31], v36, s[0:1] offset:64
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[32:35], v36, s[0:1] offset:48
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[16:19], v36, s[0:1] offset:32
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[8:11], v36, s[0:1] offset:16
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[4:7], v36, s[0:1]
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xe4
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xa4
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[2:3], v[2:3], s[50:51]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[22:23], v[14:15], s[46:47]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[20:21], v[12:13], s[44:45]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], s[48:49]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[14:15], v[26:27], s[42:43]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[12:13], v[24:25], s[40:41]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[26:27], v[30:31], s[38:39]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[24:25], v[28:29], s[36:37]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[30:31], v[34:35], s[22:23]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[28:29], v[32:33], s[20:21]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[18:19], v[18:19], s[18:19]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[16:17], v[16:17], s[16:17]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[10:11], v[10:11], s[14:15]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[8:9], v[8:9], s[12:13]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[6:7], v[6:7], s[10:11]
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[4:5], v[4:5], s[8:9]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[20:23], s[0:1] offset:96
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] offset:112
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[24:27], s[0:1] offset:64
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[12:15], s[0:1] offset:80
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[28:31], s[0:1] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:16
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1250-SDAG-LABEL: fmul_v32_vs:
 ; GFX1250-SDAG:       ; %bb.0:
-; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-SDAG-NEXT:    v_lshlrev_b32_e32 v36, 7, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_clause 0x7
-; GFX1250-SDAG-NEXT:    global_load_b128 v[0:3], v32, s[0:1] offset:16
-; GFX1250-SDAG-NEXT:    global_load_b128 v[4:7], v32, s[0:1]
-; GFX1250-SDAG-NEXT:    global_load_b128 v[8:11], v32, s[0:1] offset:48
-; GFX1250-SDAG-NEXT:    global_load_b128 v[20:23], v32, s[0:1] offset:32
-; GFX1250-SDAG-NEXT:    global_load_b128 v[12:15], v32, s[0:1] offset:80
-; GFX1250-SDAG-NEXT:    global_load_b128 v[16:19], v32, s[0:1] offset:64
-; GFX1250-SDAG-NEXT:    global_load_b128 v[24:27], v32, s[0:1] offset:112
-; GFX1250-SDAG-NEXT:    global_load_b128 v[28:31], v32, s[0:1] offset:96
-; GFX1250-SDAG-NEXT:    s_clause 0x1
-; GFX1250-SDAG-NEXT:    s_load_b512 s[8:23], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT:    s_load_b512 s[36:51], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT:    global_load_b128 v[0:3], v36, s[34:35] offset:112
+; GFX1250-SDAG-NEXT:    global_load_b128 v[12:15], v36, s[34:35] offset:96
+; GFX1250-SDAG-NEXT:    global_load_b128 v[24:27], v36, s[34:35] offset:80
+; GFX1250-SDAG-NEXT:    global_load_b128 v[28:31], v36, s[34:35] offset:64
+; GFX1250-SDAG-NEXT:    global_load_b128 v[32:35], v36, s[34:35] offset:48
+; GFX1250-SDAG-NEXT:    global_load_b128 v[16:19], v36, s[34:35] offset:32
+; GFX1250-SDAG-NEXT:    global_load_b128 v[8:11], v36, s[34:35] offset:16
+; GFX1250-SDAG-NEXT:    global_load_b128 v[4:7], v36, s[34:35]
+; GFX1250-SDAG-NEXT:    s_load_b512 s[16:31], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT:    s_load_b512 s[0:15], s[4:5], 0xa4
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x7
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], s[12:13]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[2:3], v[2:3], s[14:15]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[2:3], v[2:3], s[30:31]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[6:7], v[6:7], s[10:11]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[22:23], v[14:15], s[26:27]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[20:21], v[12:13], s[24:25]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], s[28:29]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x5
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[14:15], v[26:27], s[22:23]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[12:13], v[24:25], s[20:21]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x4
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[20:21], v[20:21], s[16:17]
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x3
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[12:13], v[12:13], s[40:41]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[26:27], v[30:31], s[18:19]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[24:25], v[28:29], s[16:17]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x2
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[18:19], v[18:19], s[38:39]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[18:19], v[18:19], s[10:11]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[16:17], v[16:17], s[8:9]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[30:31], v[34:35], s[14:15]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[28:29], v[32:33], s[12:13]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x1
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[24:25], v[24:25], s[48:49]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[10:11], v[10:11], s[6:7]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[8:9], v[8:9], s[4:5]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[28:29], v[28:29], s[44:45]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[30:31], v[30:31], s[46:47]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[26:27], v[26:27], s[50:51]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[16:17], v[16:17], s[36:37]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[14:15], v[14:15], s[42:43]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[22:23], v[22:23], s[18:19]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[8:9], v[8:9], s[20:21]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[10:11], v[10:11], s[22:23]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[4:5], v[4:5], s[8:9]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[6:7], v[6:7], s[2:3]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[4:5], v[4:5], s[0:1]
 ; GFX1250-SDAG-NEXT:    s_clause 0x7
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:96
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:112
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:64
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:80
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:48
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[4:7], s[0:1]
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:16
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[20:23], s[34:35] offset:96
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[0:3], s[34:35] offset:112
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[24:27], s[34:35] offset:64
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[12:15], s[34:35] offset:80
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[16:19], s[34:35] offset:32
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[28:31], s[34:35] offset:48
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[4:7], s[34:35]
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[8:11], s[34:35] offset:16
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: fmul_v32_vs:
@@ -2195,52 +2320,52 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ; GFX900-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
 ; GFX900-NEXT:    s_endpgm
 ;
-; PACKED-SDAG-LABEL: fma_v32_vs:
-; PACKED-SDAG:       ; %bb.0:
-; PACKED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; PACKED-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; PACKED-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
-; PACKED-SDAG-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
-; PACKED-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
-; PACKED-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(6)
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[6:7], v[6:7], s[38:39], s[38:39]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(5)
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[8:9], v[8:9], s[48:49], s[48:49]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[10:11], v[10:11], s[50:51], s[50:51]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(4)
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[16:17], v[16:17], s[44:45], s[44:45]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[18:19], v[18:19], s[46:47], s[46:47]
-; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[28:29], v[28:29], s[16:17], s[16:17]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[30:31], v[30:31], s[18:19], s[18:19]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[20:21], v[20:21], s[12:13], s[12:13]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[22:23], v[22:23], s[14:15], s[14:15]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[14:15], v[14:15], s[10:11], s[10:11]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[24:25], v[24:25], s[20:21], s[20:21]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[26:27], v[26:27], s[22:23], s[22:23]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[4:5], v[4:5], s[36:37], s[36:37]
-; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[12:13], v[12:13], s[8:9], s[8:9]
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
-; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
-; PACKED-SDAG-NEXT:    s_endpgm
+; GFX90A-SDAG-LABEL: fma_v32_vs:
+; GFX90A-SDAG:       ; %bb.0:
+; GFX90A-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; GFX90A-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; GFX90A-SDAG-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; GFX90A-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX90A-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[6:7], v[6:7], s[38:39], s[38:39]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[8:9], v[8:9], s[48:49], s[48:49]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[10:11], v[10:11], s[50:51], s[50:51]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[16:17], v[16:17], s[44:45], s[44:45]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[18:19], v[18:19], s[46:47], s[46:47]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[28:29], v[28:29], s[16:17], s[16:17]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[30:31], v[30:31], s[18:19], s[18:19]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[20:21], v[20:21], s[12:13], s[12:13]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[22:23], v[22:23], s[14:15], s[14:15]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[14:15], v[14:15], s[10:11], s[10:11]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[24:25], v[24:25], s[20:21], s[20:21]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[26:27], v[26:27], s[22:23], s[22:23]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[4:5], v[4:5], s[36:37], s[36:37]
+; GFX90A-SDAG-NEXT:    v_pk_fma_f32 v[12:13], v[12:13], s[8:9], s[8:9]
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
+; GFX90A-SDAG-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX90A-SDAG-NEXT:    s_endpgm
 ;
 ; PACKED-GISEL-LABEL: fma_v32_vs:
 ; PACKED-GISEL:       ; %bb.0:
@@ -2292,58 +2417,108 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
 ; PACKED-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: fma_v32_vs:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v24, 7, v0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:96
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[26:29], v24, s[0:1] offset:80
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[30:33], v24, s[0:1] offset:64
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[34:37], v24, s[0:1] offset:48
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:16
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1]
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xe4
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xa4
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[2:3], v[2:3], s[50:51], s[50:51]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[18:19], v[14:15], s[46:47], s[46:47]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[16:17], v[12:13], s[44:45], s[44:45]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[48:49], s[48:49]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[14:15], v[28:29], s[42:43], s[42:43]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[12:13], v[26:27], s[40:41], s[40:41]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[28:29], v[32:33], s[38:39], s[38:39]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[26:27], v[30:31], s[36:37], s[36:37]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[32:33], v[36:37], s[22:23], s[22:23]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[30:31], v[34:35], s[20:21], s[20:21]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[22:23], v[22:23], s[18:19], s[18:19]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[20:21], v[20:21], s[16:17], s[16:17]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[10:11], v[10:11], s[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[8:9], v[8:9], s[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[6:7], v[6:7], s[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_pk_fma_f32 v[4:5], v[4:5], s[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[16:19], s[0:1] offset:96
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[0:3], s[0:1] offset:112
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[26:29], s[0:1] offset:64
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[12:15], s[0:1] offset:80
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[20:23], s[0:1] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[30:33], s[0:1] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[4:7], s[0:1]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v24, v[8:11], s[0:1] offset:16
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1250-SDAG-LABEL: fma_v32_vs:
 ; GFX1250-SDAG:       ; %bb.0:
-; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-SDAG-NEXT:    v_lshlrev_b32_e32 v36, 7, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_clause 0x7
-; GFX1250-SDAG-NEXT:    global_load_b128 v[0:3], v32, s[0:1] offset:16
-; GFX1250-SDAG-NEXT:    global_load_b128 v[4:7], v32, s[0:1]
-; GFX1250-SDAG-NEXT:    global_load_b128 v[8:11], v32, s[0:1] offset:48
-; GFX1250-SDAG-NEXT:    global_load_b128 v[20:23], v32, s[0:1] offset:32
-; GFX1250-SDAG-NEXT:    global_load_b128 v[12:15], v32, s[0:1] offset:80
-; GFX1250-SDAG-NEXT:    global_load_b128 v[16:19], v32, s[0:1] offset:64
-; GFX1250-SDAG-NEXT:    global_load_b128 v[24:27], v32, s[0:1] offset:112
-; GFX1250-SDAG-NEXT:    global_load_b128 v[28:31], v32, s[0:1] offset:96
-; GFX1250-SDAG-NEXT:    s_clause 0x1
-; GFX1250-SDAG-NEXT:    s_load_b512 s[8:23], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT:    s_load_b512 s[36:51], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT:    global_load_b128 v[0:3], v36, s[34:35] offset:112
+; GFX1250-SDAG-NEXT:    global_load_b128 v[12:15], v36, s[34:35] offset:96
+; GFX1250-SDAG-NEXT:    global_load_b128 v[24:27], v36, s[34:35] offset:80
+; GFX1250-SDAG-NEXT:    global_load_b128 v[28:31], v36, s[34:35] offset:64
+; GFX1250-SDAG-NEXT:    global_load_b128 v[32:35], v36, s[34:35] offset:48
+; GFX1250-SDAG-NEXT:    global_load_b128 v[16:19], v36, s[34:35] offset:32
+; GFX1250-SDAG-NEXT:    global_load_b128 v[8:11], v36, s[34:35] offset:16
+; GFX1250-SDAG-NEXT:    global_load_b128 v[4:7], v36, s[34:35]
+; GFX1250-SDAG-NEXT:    s_load_b512 s[16:31], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT:    s_load_b512 s[0:15], s[4:5], 0xa4
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x7
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[12:13], s[12:13]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[2:3], v[2:3], s[14:15], s[14:15]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[2:3], v[2:3], s[30:31], s[30:31]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[6:7], v[6:7], s[10:11], s[10:11]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[22:23], v[14:15], s[26:27], s[26:27]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[20:21], v[12:13], s[24:25], s[24:25]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[28:29], s[28:29]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x5
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[14:15], v[26:27], s[22:23], s[22:23]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[12:13], v[24:25], s[20:21], s[20:21]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x4
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[20:21], v[20:21], s[16:17], s[16:17]
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x3
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[12:13], v[12:13], s[40:41], s[40:41]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[26:27], v[30:31], s[18:19], s[18:19]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[24:25], v[28:29], s[16:17], s[16:17]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x2
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[18:19], v[18:19], s[38:39], s[38:39]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[18:19], v[18:19], s[10:11], s[10:11]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[16:17], v[16:17], s[8:9], s[8:9]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[30:31], v[34:35], s[14:15], s[14:15]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[28:29], v[32:33], s[12:13], s[12:13]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x1
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[24:25], v[24:25], s[48:49], s[48:49]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[10:11], v[10:11], s[6:7], s[6:7]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[8:9], v[8:9], s[4:5], s[4:5]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[28:29], v[28:29], s[44:45], s[44:45]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[30:31], v[30:31], s[46:47], s[46:47]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[26:27], v[26:27], s[50:51], s[50:51]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[16:17], v[16:17], s[36:37], s[36:37]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[14:15], v[14:15], s[42:43], s[42:43]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[22:23], v[22:23], s[18:19], s[18:19]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[8:9], v[8:9], s[20:21], s[20:21]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[10:11], v[10:11], s[22:23], s[22:23]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[4:5], v[4:5], s[8:9], s[8:9]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[6:7], v[6:7], s[2:3], s[2:3]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[4:5], v[4:5], s[0:1], s[0:1]
 ; GFX1250-SDAG-NEXT:    s_clause 0x7
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:96
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:112
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:64
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:80
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:48
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[4:7], s[0:1]
-; GFX1250-SDAG-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:16
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[20:23], s[34:35] offset:96
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[0:3], s[34:35] offset:112
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[24:27], s[34:35] offset:64
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[12:15], s[34:35] offset:80
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[16:19], s[34:35] offset:32
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[28:31], s[34:35] offset:48
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[4:7], s[34:35]
+; GFX1250-SDAG-NEXT:    global_store_b128 v36, v[8:11], s[34:35] offset:16
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: fma_v32_vs:
@@ -3274,7 +3449,7 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
 ; GFX1250-SDAG-NEXT:    s_add_f32 s1, s1, 0
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX1250-SDAG-NEXT:    flat_store_b64 v[0:1], v[0:1]
+; GFX1250-SDAG-NEXT:    flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: fadd_fadd_fsub_0:
@@ -3287,7 +3462,7 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
 ; GFX1250-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], 0
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v3, v0
-; GFX1250-GISEL-NEXT:    flat_store_b64 v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX1250-GISEL-NEXT:    s_endpgm
 bb:
   %i12 = fadd <2 x float> zeroinitializer, %arg
@@ -3611,6 +3786,3 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
 declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX90A-SDAG: {{.*}}
-; GFX942-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index efe4cfa7e5d2b..8bcb6642f2e0e 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -476,10 +476,10 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %o
 ; GFX942-NEXT:    .p2align 8
 ; GFX942-NEXT:  ; %bb.2:
 ; GFX942-NEXT:  .LBB12_0:
-; GFX942-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX942-NEXT:    v_mov_b32_e32 v2, s8
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX942-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
 ; GFX942-NEXT:    s_endpgm
 ;
@@ -990,10 +990,8 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out,
 ; GFX942-NEXT:  ; %bb.2:
 ; GFX942-NEXT:  .LBB25_0:
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, s6
-; GFX942-NEXT:    v_mov_b32_e32 v1, s7
-; GFX942-NEXT:    v_mov_b32_e32 v2, s8
-; GFX942-NEXT:    v_mov_b32_e32 v3, s9
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX942-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll
index 192bd2073886a..5a7f7fb00f04e 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll
@@ -18,12 +18,11 @@ define amdgpu_kernel void @copy_to_vreg_1(i32 %0) {
 ; GCN-NEXT:    s_addc_u32 s0, 1, 0
 ; GCN-NEXT:    v_readfirstlane_b32 s2, v1
 ; GCN-NEXT:    s_cmp_ge_u32 s3, s4
-; GCN-NEXT:    s_cselect_b32 s4, s0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_cmp_lg_u64 0, 0
+; GCN-NEXT:    s_cselect_b32 s4, s0, s2
 ; GCN-NEXT:    s_mov_b64 s[0:1], 0
 ; GCN-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], 0
 ; GCN-NEXT:    s_branch .LBB0_3
 ; GCN-NEXT:  .LBB0_1: ; %Flow
 ; GCN-NEXT:    ; in Loop: Header=BB0_3 Depth=1
@@ -33,9 +32,9 @@ define amdgpu_kernel void @copy_to_vreg_1(i32 %0) {
 ; GCN-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_and_b64 s[4:5], exec, s[8:9]
-; GCN-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
+; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    s_mov_b32 s4, 0
-; GCN-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GCN-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GCN-NEXT:    s_cbranch_execz .LBB0_8
 ; GCN-NEXT:  .LBB0_3: ; %.lr.ph27
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -57,7 +56,7 @@ define amdgpu_kernel void @copy_to_vreg_1(i32 %0) {
 ; GCN-NEXT:    s_cbranch_execz .LBB0_2
 ; GCN-NEXT:  ; %bb.6: ; %pred.store.continue
 ; GCN-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[2:3]
+; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[0:1]
 ; GCN-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
 ; GCN-NEXT:    s_cbranch_execz .LBB0_1
 ; GCN-NEXT:  ; %bb.7: ; %pred.store.if41
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll
index 51dc9a51ec9d0..3fd7f1cb481a4 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll
@@ -88,8 +88,7 @@ define void @v_shuffle_v2i64_v2i64__1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -144,8 +143,7 @@ define void @v_shuffle_v2i64_v2i64__3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -204,10 +202,8 @@ define void @v_shuffle_v2i64_v2i64__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -262,8 +258,7 @@ define void @v_shuffle_v2i64_v2i64__3_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -309,8 +304,7 @@ define void @v_shuffle_v2i64_v2i64__3_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -355,8 +349,7 @@ define void @v_shuffle_v2i64_v2i64__3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -401,8 +394,7 @@ define void @v_shuffle_v2i64_v2i64__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -446,8 +438,7 @@ define void @v_shuffle_v2i64_v2i64__0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -492,8 +483,7 @@ define void @v_shuffle_v2i64_v2i64__1_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -537,8 +527,7 @@ define void @v_shuffle_v2i64_v2i64__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -660,8 +649,7 @@ define void @v_shuffle_v2i64_v2i64__1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -794,8 +782,7 @@ define void @v_shuffle_v2i64_v2i64__1_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -900,8 +887,7 @@ define void @v_shuffle_v2i64_v2i64__0_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -956,8 +942,7 @@ define void @v_shuffle_v2i64_v2i64__1_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1095,8 +1080,7 @@ define void @s_shuffle_v2i64_v2i64__1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1154,8 +1138,7 @@ define void @s_shuffle_v2i64_v2i64__3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1213,10 +1196,8 @@ define void @s_shuffle_v2i64_v2i64__3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1270,8 +1251,7 @@ define void @s_shuffle_v2i64_v2i64__3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1320,10 +1300,8 @@ define void @s_shuffle_v2i64_v2i64__3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1336,18 +1314,43 @@ define void @s_shuffle_v2i64_v2i64__3_2() {
 }
 
 define void @s_shuffle_v2i64_v2i64__3_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v2i64__3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v2i64__3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v2i64__3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v2i64__3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> <i32 3, i32 3>
@@ -1388,8 +1391,7 @@ define void @s_shuffle_v2i64_v2i64__u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1401,18 +1403,43 @@ define void @s_shuffle_v2i64_v2i64__u_0() {
 }
 
 define void @s_shuffle_v2i64_v2i64__0_0() {
-; GFX9-LABEL: s_shuffle_v2i64_v2i64__0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v2i64__0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v2i64__0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v2i64__0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -1456,10 +1483,8 @@ define void @s_shuffle_v2i64_v2i64__1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1503,8 +1528,7 @@ define void @s_shuffle_v2i64_v2i64__2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1596,18 +1620,43 @@ define void @s_shuffle_v2i64_v2i64__0_1() {
 }
 
 define void @s_shuffle_v2i64_v2i64__1_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v2i64__1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v2i64__1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v2i64__1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v2i64__1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -1741,8 +1790,7 @@ define void @s_shuffle_v2i64_v2i64__1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1850,8 +1898,7 @@ define void @s_shuffle_v2i64_v2i64__0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1905,8 +1952,7 @@ define void @s_shuffle_v2i64_v2i64__1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll
index bc8a56a30d8f9..f54d45b1367cc 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll
@@ -127,8 +127,7 @@ define void @v_shuffle_v2i64_v3i64__2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -223,8 +222,7 @@ define void @v_shuffle_v2i64_v3i64__5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -283,10 +281,8 @@ define void @v_shuffle_v2i64_v3i64__5_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -341,8 +337,7 @@ define void @v_shuffle_v2i64_v3i64__5_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v8
-; GFX942-NEXT:    v_mov_b32_e32 v1, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -397,8 +392,7 @@ define void @v_shuffle_v2i64_v3i64__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v10
-; GFX942-NEXT:    v_mov_b32_e32 v3, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -447,10 +441,8 @@ define void @v_shuffle_v2i64_v3i64__5_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -495,8 +487,7 @@ define void @v_shuffle_v2i64_v3i64__5_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -541,8 +532,7 @@ define void @v_shuffle_v2i64_v3i64__5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -587,8 +577,7 @@ define void @v_shuffle_v2i64_v3i64__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -632,8 +621,7 @@ define void @v_shuffle_v2i64_v3i64__0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -677,8 +665,7 @@ define void @v_shuffle_v2i64_v3i64__1_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -726,10 +713,8 @@ define void @v_shuffle_v2i64_v3i64__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -773,8 +758,7 @@ define void @v_shuffle_v2i64_v3i64__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -828,8 +812,7 @@ define void @v_shuffle_v2i64_v3i64__4_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -952,8 +935,7 @@ define void @v_shuffle_v2i64_v3i64__1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -997,8 +979,7 @@ define void @v_shuffle_v2i64_v3i64__2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1091,8 +1072,7 @@ define void @v_shuffle_v2i64_v3i64__4_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v2
-; GFX942-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1176,8 +1156,7 @@ define void @v_shuffle_v2i64_v3i64__0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1260,8 +1239,7 @@ define void @v_shuffle_v2i64_v3i64__2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1354,8 +1332,7 @@ define void @v_shuffle_v2i64_v3i64__4_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v4
-; GFX942-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1489,8 +1466,7 @@ define void @v_shuffle_v2i64_v3i64__2_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1545,8 +1521,7 @@ define void @v_shuffle_v2i64_v3i64__4_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1641,8 +1616,7 @@ define void @v_shuffle_v2i64_v3i64__0_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1697,8 +1671,7 @@ define void @v_shuffle_v2i64_v3i64__1_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1753,8 +1726,7 @@ define void @v_shuffle_v2i64_v3i64__2_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1839,8 +1811,7 @@ define void @v_shuffle_v2i64_v3i64__4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1935,8 +1906,7 @@ define void @v_shuffle_v2i64_v3i64__0_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1991,8 +1961,7 @@ define void @v_shuffle_v2i64_v3i64__1_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v8
-; GFX942-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2047,8 +2016,7 @@ define void @v_shuffle_v2i64_v3i64__2_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v4
-; GFX942-NEXT:    v_mov_b32_e32 v9, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2093,8 +2061,7 @@ define void @v_shuffle_v2i64_v3i64__3_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2232,8 +2199,7 @@ define void @s_shuffle_v2i64_v3i64__1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2273,8 +2239,7 @@ define void @s_shuffle_v2i64_v3i64__2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2332,8 +2297,7 @@ define void @s_shuffle_v2i64_v3i64__4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2374,8 +2338,7 @@ define void @s_shuffle_v2i64_v3i64__5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2430,11 +2393,11 @@ define void @s_shuffle_v2i64_v3i64__5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2484,8 +2447,7 @@ define void @s_shuffle_v2i64_v3i64__5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2539,10 +2501,8 @@ define void @s_shuffle_v2i64_v3i64__5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2587,10 +2547,8 @@ define void @s_shuffle_v2i64_v3i64__5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2603,18 +2561,43 @@ define void @s_shuffle_v2i64_v3i64__5_3() {
 }
 
 define void @s_shuffle_v2i64_v3i64__5_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v3i64__5_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v3i64__5_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v3i64__5_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v3i64__5_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> <i32 5, i32 4>
@@ -2659,10 +2642,8 @@ define void @s_shuffle_v2i64_v3i64__5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2707,8 +2688,7 @@ define void @s_shuffle_v2i64_v3i64__u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2720,18 +2700,43 @@ define void @s_shuffle_v2i64_v3i64__u_0() {
 }
 
 define void @s_shuffle_v2i64_v3i64__0_0() {
-; GFX9-LABEL: s_shuffle_v2i64_v3i64__0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v3i64__0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v3i64__0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v3i64__0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -2775,10 +2780,8 @@ define void @s_shuffle_v2i64_v3i64__1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2822,10 +2825,8 @@ define void @s_shuffle_v2i64_v3i64__2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2869,8 +2870,7 @@ define void @s_shuffle_v2i64_v3i64__3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2924,13 +2924,11 @@ define void @s_shuffle_v2i64_v3i64__4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3023,18 +3021,43 @@ define void @s_shuffle_v2i64_v3i64__0_1() {
 }
 
 define void @s_shuffle_v2i64_v3i64__1_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v3i64__1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v3i64__1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v3i64__1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v3i64__1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> <i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -3042,18 +3065,43 @@ define void @s_shuffle_v2i64_v3i64__1_1() {
 }
 
 define void @s_shuffle_v2i64_v3i64__2_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v3i64__2_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v3i64__2_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v3i64__2_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v3i64__2_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> <i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -3142,8 +3190,7 @@ define void @s_shuffle_v2i64_v3i64__4_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3188,8 +3235,7 @@ define void @s_shuffle_v2i64_v3i64__u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3201,18 +3247,43 @@ define void @s_shuffle_v2i64_v3i64__u_2() {
 }
 
 define void @s_shuffle_v2i64_v3i64__0_2() {
-; GFX9-LABEL: s_shuffle_v2i64_v3i64__0_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v3i64__0_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v3i64__0_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v3i64__0_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> <i32 0, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -3256,10 +3327,8 @@ define void @s_shuffle_v2i64_v3i64__1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3307,10 +3376,8 @@ define void @s_shuffle_v2i64_v3i64__2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3354,8 +3421,7 @@ define void @s_shuffle_v2i64_v3i64__3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3412,10 +3478,8 @@ define void @s_shuffle_v2i64_v3i64__4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3514,8 +3578,7 @@ define void @s_shuffle_v2i64_v3i64__1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3555,8 +3618,7 @@ define void @s_shuffle_v2i64_v3i64__2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3618,10 +3680,8 @@ define void @s_shuffle_v2i64_v3i64__4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3716,8 +3776,7 @@ define void @s_shuffle_v2i64_v3i64__0_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3771,8 +3830,7 @@ define void @s_shuffle_v2i64_v3i64__1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3826,8 +3884,7 @@ define void @s_shuffle_v2i64_v3i64__2_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3881,18 +3938,43 @@ define void @s_shuffle_v2i64_v3i64__3_4() {
 }
 
 define void @s_shuffle_v2i64_v3i64__4_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v3i64__4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v3i64__4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v3i64__4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v3i64__4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> <i32 4, i32 4>
@@ -3933,8 +4015,7 @@ define void @s_shuffle_v2i64_v3i64__u_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3988,8 +4069,7 @@ define void @s_shuffle_v2i64_v3i64__0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4047,10 +4127,8 @@ define void @s_shuffle_v2i64_v3i64__1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4104,10 +4182,8 @@ define void @s_shuffle_v2i64_v3i64__2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4120,18 +4196,43 @@ define void @s_shuffle_v2i64_v3i64__2_5() {
 }
 
 define void @s_shuffle_v2i64_v3i64__3_5() {
-; GFX9-LABEL: s_shuffle_v2i64_v3i64__3_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v3i64__3_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v3i64__3_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v3i64__3_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> <i32 3, i32 5>
@@ -4176,10 +4277,8 @@ define void @s_shuffle_v2i64_v3i64__4_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll
index dd42a1dd44320..1c738b8f4f1d9 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll
@@ -166,8 +166,7 @@ define void @v_shuffle_v2i64_v4i64__3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -302,8 +301,7 @@ define void @v_shuffle_v2i64_v4i64__7_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -362,10 +360,8 @@ define void @v_shuffle_v2i64_v4i64__7_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -420,8 +416,7 @@ define void @v_shuffle_v2i64_v4i64__7_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v10
-; GFX942-NEXT:    v_mov_b32_e32 v1, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -476,8 +471,7 @@ define void @v_shuffle_v2i64_v4i64__7_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v12
-; GFX942-NEXT:    v_mov_b32_e32 v3, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -532,8 +526,7 @@ define void @v_shuffle_v2i64_v4i64__7_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v14
-; GFX942-NEXT:    v_mov_b32_e32 v5, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -582,10 +575,8 @@ define void @v_shuffle_v2i64_v4i64__7_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -630,8 +621,7 @@ define void @v_shuffle_v2i64_v4i64__7_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -676,8 +666,7 @@ define void @v_shuffle_v2i64_v4i64__7_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -722,8 +711,7 @@ define void @v_shuffle_v2i64_v4i64__7_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -768,8 +756,7 @@ define void @v_shuffle_v2i64_v4i64__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -813,8 +800,7 @@ define void @v_shuffle_v2i64_v4i64__0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -858,8 +844,7 @@ define void @v_shuffle_v2i64_v4i64__1_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -903,8 +888,7 @@ define void @v_shuffle_v2i64_v4i64__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -952,10 +936,8 @@ define void @v_shuffle_v2i64_v4i64__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -999,8 +981,7 @@ define void @v_shuffle_v2i64_v4i64__4_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1054,8 +1035,7 @@ define void @v_shuffle_v2i64_v4i64__5_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1110,8 +1090,7 @@ define void @v_shuffle_v2i64_v4i64__6_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v0
-; GFX942-NEXT:    v_mov_b32_e32 v9, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1234,8 +1213,7 @@ define void @v_shuffle_v2i64_v4i64__1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1279,8 +1257,7 @@ define void @v_shuffle_v2i64_v4i64__2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v2
-; GFX942-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1324,8 +1301,7 @@ define void @v_shuffle_v2i64_v4i64__3_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1418,8 +1394,7 @@ define void @v_shuffle_v2i64_v4i64__5_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v2
-; GFX942-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1474,8 +1449,7 @@ define void @v_shuffle_v2i64_v4i64__6_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v2
-; GFX942-NEXT:    v_mov_b32_e32 v11, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1559,8 +1533,7 @@ define void @v_shuffle_v2i64_v4i64__0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1643,8 +1616,7 @@ define void @v_shuffle_v2i64_v4i64__2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1688,8 +1660,7 @@ define void @v_shuffle_v2i64_v4i64__3_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1782,8 +1753,7 @@ define void @v_shuffle_v2i64_v4i64__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v4
-; GFX942-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1838,8 +1808,7 @@ define void @v_shuffle_v2i64_v4i64__6_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v4
-; GFX942-NEXT:    v_mov_b32_e32 v13, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1923,8 +1892,7 @@ define void @v_shuffle_v2i64_v4i64__0_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1968,8 +1936,7 @@ define void @v_shuffle_v2i64_v4i64__1_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2052,8 +2019,7 @@ define void @v_shuffle_v2i64_v4i64__3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2146,8 +2112,7 @@ define void @v_shuffle_v2i64_v4i64__5_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v6
-; GFX942-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2202,8 +2167,7 @@ define void @v_shuffle_v2i64_v4i64__6_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v6
-; GFX942-NEXT:    v_mov_b32_e32 v15, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2376,8 +2340,7 @@ define void @v_shuffle_v2i64_v4i64__3_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2432,8 +2395,7 @@ define void @v_shuffle_v2i64_v4i64__5_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2478,8 +2440,7 @@ define void @v_shuffle_v2i64_v4i64__6_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2574,8 +2535,7 @@ define void @v_shuffle_v2i64_v4i64__0_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2630,8 +2590,7 @@ define void @v_shuffle_v2i64_v4i64__1_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2686,8 +2645,7 @@ define void @v_shuffle_v2i64_v4i64__2_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v8
-; GFX942-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2742,8 +2700,7 @@ define void @v_shuffle_v2i64_v4i64__3_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v6
-; GFX942-NEXT:    v_mov_b32_e32 v9, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2828,8 +2785,7 @@ define void @v_shuffle_v2i64_v4i64__5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2874,8 +2830,7 @@ define void @v_shuffle_v2i64_v4i64__6_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v2
-; GFX942-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2970,8 +2925,7 @@ define void @v_shuffle_v2i64_v4i64__0_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3026,8 +2980,7 @@ define void @v_shuffle_v2i64_v4i64__1_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v8
-; GFX942-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3082,8 +3035,7 @@ define void @v_shuffle_v2i64_v4i64__2_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v10
-; GFX942-NEXT:    v_mov_b32_e32 v7, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3138,8 +3090,7 @@ define void @v_shuffle_v2i64_v4i64__3_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v6
-; GFX942-NEXT:    v_mov_b32_e32 v11, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3184,8 +3135,7 @@ define void @v_shuffle_v2i64_v4i64__4_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3270,8 +3220,7 @@ define void @v_shuffle_v2i64_v4i64__6_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3366,8 +3315,7 @@ define void @v_shuffle_v2i64_v4i64__0_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3422,8 +3370,7 @@ define void @v_shuffle_v2i64_v4i64__1_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v10
-; GFX942-NEXT:    v_mov_b32_e32 v5, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3478,8 +3425,7 @@ define void @v_shuffle_v2i64_v4i64__2_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v12
-; GFX942-NEXT:    v_mov_b32_e32 v7, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3534,8 +3480,7 @@ define void @v_shuffle_v2i64_v4i64__3_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v6
-; GFX942-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3580,8 +3525,7 @@ define void @v_shuffle_v2i64_v4i64__4_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3626,8 +3570,7 @@ define void @v_shuffle_v2i64_v4i64__5_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3765,8 +3708,7 @@ define void @s_shuffle_v2i64_v4i64__1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3850,8 +3792,7 @@ define void @s_shuffle_v2i64_v4i64__3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3909,8 +3850,7 @@ define void @s_shuffle_v2i64_v4i64__5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3996,8 +3936,7 @@ define void @s_shuffle_v2i64_v4i64__7_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4056,10 +3995,8 @@ define void @s_shuffle_v2i64_v4i64__7_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4113,8 +4050,7 @@ define void @s_shuffle_v2i64_v4i64__7_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4172,10 +4108,8 @@ define void @s_shuffle_v2i64_v4i64__7_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4230,8 +4164,7 @@ define void @s_shuffle_v2i64_v4i64__7_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4280,10 +4213,8 @@ define void @s_shuffle_v2i64_v4i64__7_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4296,18 +4227,43 @@ define void @s_shuffle_v2i64_v4i64__7_4() {
 }
 
 define void @s_shuffle_v2i64_v4i64__7_5() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__7_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__7_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__7_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__7_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 7, i32 5>
@@ -4352,10 +4308,8 @@ define void @s_shuffle_v2i64_v4i64__7_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4368,18 +4322,43 @@ define void @s_shuffle_v2i64_v4i64__7_6() {
 }
 
 define void @s_shuffle_v2i64_v4i64__7_7() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 7, i32 7>
@@ -4420,8 +4399,7 @@ define void @s_shuffle_v2i64_v4i64__u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4433,18 +4411,43 @@ define void @s_shuffle_v2i64_v4i64__u_0() {
 }
 
 define void @s_shuffle_v2i64_v4i64__0_0() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -4488,10 +4491,8 @@ define void @s_shuffle_v2i64_v4i64__1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4503,18 +4504,43 @@ define void @s_shuffle_v2i64_v4i64__1_0() {
 }
 
 define void @s_shuffle_v2i64_v4i64__2_0() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__2_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s4
-; GFX9-NEXT:    s_mov_b32 s11, s5
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__2_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__2_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__2_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 2, i32 0>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -4558,10 +4584,8 @@ define void @s_shuffle_v2i64_v4i64__3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4605,8 +4629,7 @@ define void @s_shuffle_v2i64_v4i64__4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4664,10 +4687,8 @@ define void @s_shuffle_v2i64_v4i64__5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4722,8 +4743,7 @@ define void @s_shuffle_v2i64_v4i64__6_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4816,18 +4836,43 @@ define void @s_shuffle_v2i64_v4i64__0_1() {
 }
 
 define void @s_shuffle_v2i64_v4i64__1_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -4835,18 +4880,43 @@ define void @s_shuffle_v2i64_v4i64__1_1() {
 }
 
 define void @s_shuffle_v2i64_v4i64__2_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__2_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__2_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__2_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__2_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -4854,18 +4924,43 @@ define void @s_shuffle_v2i64_v4i64__2_1() {
 }
 
 define void @s_shuffle_v2i64_v4i64__3_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__3_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__3_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__3_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__3_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 3, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -4954,8 +5049,7 @@ define void @s_shuffle_v2i64_v4i64__5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5010,8 +5104,7 @@ define void @s_shuffle_v2i64_v4i64__6_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5056,8 +5149,7 @@ define void @s_shuffle_v2i64_v4i64__u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5069,18 +5161,43 @@ define void @s_shuffle_v2i64_v4i64__u_2() {
 }
 
 define void @s_shuffle_v2i64_v4i64__0_2() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__0_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__0_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__0_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__0_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -5124,10 +5241,8 @@ define void @s_shuffle_v2i64_v4i64__1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5139,18 +5254,43 @@ define void @s_shuffle_v2i64_v4i64__1_2() {
 }
 
 define void @s_shuffle_v2i64_v4i64__2_2() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -5194,10 +5334,8 @@ define void @s_shuffle_v2i64_v4i64__3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5241,8 +5379,7 @@ define void @s_shuffle_v2i64_v4i64__4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5299,10 +5436,8 @@ define void @s_shuffle_v2i64_v4i64__5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5357,8 +5492,7 @@ define void @s_shuffle_v2i64_v4i64__6_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5411,18 +5545,43 @@ define void @s_shuffle_v2i64_v4i64__u_3() {
 }
 
 define void @s_shuffle_v2i64_v4i64__0_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__0_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__0_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__0_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__0_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 0, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -5430,18 +5589,43 @@ define void @s_shuffle_v2i64_v4i64__0_3() {
 }
 
 define void @s_shuffle_v2i64_v4i64__1_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__1_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__1_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__1_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__1_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -5489,18 +5673,43 @@ define void @s_shuffle_v2i64_v4i64__2_3() {
 }
 
 define void @s_shuffle_v2i64_v4i64__3_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> <i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -5590,8 +5799,7 @@ define void @s_shuffle_v2i64_v4i64__5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5646,8 +5854,7 @@ define void @s_shuffle_v2i64_v4i64__6_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5746,8 +5953,7 @@ define void @s_shuffle_v2i64_v4i64__1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5831,8 +6037,7 @@ define void @s_shuffle_v2i64_v4i64__3_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5894,10 +6099,8 @@ define void @s_shuffle_v2i64_v4i64__5_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5910,18 +6113,43 @@ define void @s_shuffle_v2i64_v4i64__5_4() {
 }
 
 define void @s_shuffle_v2i64_v4i64__6_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__6_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s4
-; GFX9-NEXT:    s_mov_b32 s11, s5
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__6_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__6_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__6_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 6, i32 4>
@@ -6012,8 +6240,7 @@ define void @s_shuffle_v2i64_v4i64__0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6067,8 +6294,7 @@ define void @s_shuffle_v2i64_v4i64__1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6123,8 +6349,7 @@ define void @s_shuffle_v2i64_v4i64__2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6178,8 +6403,7 @@ define void @s_shuffle_v2i64_v4i64__3_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6233,18 +6457,43 @@ define void @s_shuffle_v2i64_v4i64__4_5() {
 }
 
 define void @s_shuffle_v2i64_v4i64__5_5() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 5, i32 5>
@@ -6253,18 +6502,43 @@ define void @s_shuffle_v2i64_v4i64__5_5() {
 }
 
 define void @s_shuffle_v2i64_v4i64__6_5() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__6_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__6_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__6_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__6_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 6, i32 5>
@@ -6305,8 +6579,7 @@ define void @s_shuffle_v2i64_v4i64__u_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6360,8 +6633,7 @@ define void @s_shuffle_v2i64_v4i64__0_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6419,10 +6691,8 @@ define void @s_shuffle_v2i64_v4i64__1_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6477,8 +6747,7 @@ define void @s_shuffle_v2i64_v4i64__2_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6536,10 +6805,8 @@ define void @s_shuffle_v2i64_v4i64__3_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6552,18 +6819,43 @@ define void @s_shuffle_v2i64_v4i64__3_6() {
 }
 
 define void @s_shuffle_v2i64_v4i64__4_6() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__4_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__4_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__4_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__4_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 4, i32 6>
@@ -6608,10 +6900,8 @@ define void @s_shuffle_v2i64_v4i64__5_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6624,18 +6914,43 @@ define void @s_shuffle_v2i64_v4i64__5_6() {
 }
 
 define void @s_shuffle_v2i64_v4i64__6_6() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 6, i32 6>
@@ -6726,8 +7041,7 @@ define void @s_shuffle_v2i64_v4i64__0_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6782,8 +7096,7 @@ define void @s_shuffle_v2i64_v4i64__1_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6838,8 +7151,7 @@ define void @s_shuffle_v2i64_v4i64__2_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6894,8 +7206,7 @@ define void @s_shuffle_v2i64_v4i64__3_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6908,18 +7219,43 @@ define void @s_shuffle_v2i64_v4i64__3_7() {
 }
 
 define void @s_shuffle_v2i64_v4i64__4_7() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__4_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__4_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__4_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__4_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 4, i32 7>
@@ -6928,18 +7264,43 @@ define void @s_shuffle_v2i64_v4i64__4_7() {
 }
 
 define void @s_shuffle_v2i64_v4i64__5_7() {
-; GFX9-LABEL: s_shuffle_v2i64_v4i64__5_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v4i64__5_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v4i64__5_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v4i64__5_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> <i32 5, i32 7>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
index 7ee7c83e0122d..c8aac3a841c69 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
@@ -322,8 +322,7 @@ define void @v_shuffle_v2i64_v8i64__7_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v14
-; GFX942-NEXT:    v_mov_b32_e32 v1, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -618,8 +617,7 @@ define void @v_shuffle_v2i64_v8i64__15_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v14
-; GFX942-NEXT:    v_mov_b32_e32 v1, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -678,10 +676,8 @@ define void @v_shuffle_v2i64_v8i64__15_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v16
-; GFX942-NEXT:    v_mov_b32_e32 v3, v17
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[16:17]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -736,8 +732,7 @@ define void @v_shuffle_v2i64_v8i64__15_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v18
-; GFX942-NEXT:    v_mov_b32_e32 v1, v19
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -792,8 +787,7 @@ define void @v_shuffle_v2i64_v8i64__15_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v20
-; GFX942-NEXT:    v_mov_b32_e32 v3, v21
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -848,8 +842,7 @@ define void @v_shuffle_v2i64_v8i64__15_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v22
-; GFX942-NEXT:    v_mov_b32_e32 v5, v23
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -904,8 +897,7 @@ define void @v_shuffle_v2i64_v8i64__15_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v24
-; GFX942-NEXT:    v_mov_b32_e32 v7, v25
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -960,8 +952,7 @@ define void @v_shuffle_v2i64_v8i64__15_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v26
-; GFX942-NEXT:    v_mov_b32_e32 v9, v27
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1016,8 +1007,7 @@ define void @v_shuffle_v2i64_v8i64__15_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v28
-; GFX942-NEXT:    v_mov_b32_e32 v11, v29
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1072,8 +1062,7 @@ define void @v_shuffle_v2i64_v8i64__15_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v30
-; GFX942-NEXT:    v_mov_b32_e32 v13, v31
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
 ; GFX942-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1122,10 +1111,8 @@ define void @v_shuffle_v2i64_v8i64__15_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v14
-; GFX942-NEXT:    v_mov_b32_e32 v3, v15
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1170,8 +1157,7 @@ define void @v_shuffle_v2i64_v8i64__15_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v14
-; GFX942-NEXT:    v_mov_b32_e32 v1, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1216,8 +1202,7 @@ define void @v_shuffle_v2i64_v8i64__15_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v14
-; GFX942-NEXT:    v_mov_b32_e32 v3, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1262,8 +1247,7 @@ define void @v_shuffle_v2i64_v8i64__15_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v14
-; GFX942-NEXT:    v_mov_b32_e32 v5, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1308,8 +1292,7 @@ define void @v_shuffle_v2i64_v8i64__15_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v14
-; GFX942-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1354,8 +1337,7 @@ define void @v_shuffle_v2i64_v8i64__15_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v14
-; GFX942-NEXT:    v_mov_b32_e32 v9, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1400,8 +1382,7 @@ define void @v_shuffle_v2i64_v8i64__15_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v14
-; GFX942-NEXT:    v_mov_b32_e32 v11, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1446,8 +1427,7 @@ define void @v_shuffle_v2i64_v8i64__15_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v14
-; GFX942-NEXT:    v_mov_b32_e32 v13, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1492,8 +1472,7 @@ define void @v_shuffle_v2i64_v8i64__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1537,8 +1516,7 @@ define void @v_shuffle_v2i64_v8i64__0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1582,8 +1560,7 @@ define void @v_shuffle_v2i64_v8i64__1_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1627,8 +1604,7 @@ define void @v_shuffle_v2i64_v8i64__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1672,8 +1648,7 @@ define void @v_shuffle_v2i64_v8i64__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v0
-; GFX942-NEXT:    v_mov_b32_e32 v9, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1717,8 +1692,7 @@ define void @v_shuffle_v2i64_v8i64__4_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v0
-; GFX942-NEXT:    v_mov_b32_e32 v11, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1762,8 +1736,7 @@ define void @v_shuffle_v2i64_v8i64__5_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v0
-; GFX942-NEXT:    v_mov_b32_e32 v13, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1807,8 +1780,7 @@ define void @v_shuffle_v2i64_v8i64__6_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v0
-; GFX942-NEXT:    v_mov_b32_e32 v15, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1856,10 +1828,8 @@ define void @v_shuffle_v2i64_v8i64__7_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v14
-; GFX942-NEXT:    v_mov_b32_e32 v3, v15
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1903,8 +1873,7 @@ define void @v_shuffle_v2i64_v8i64__8_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1958,8 +1927,7 @@ define void @v_shuffle_v2i64_v8i64__9_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2014,8 +1982,7 @@ define void @v_shuffle_v2i64_v8i64__10_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v0
-; GFX942-NEXT:    v_mov_b32_e32 v9, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2070,8 +2037,7 @@ define void @v_shuffle_v2i64_v8i64__11_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v0
-; GFX942-NEXT:    v_mov_b32_e32 v11, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2126,8 +2092,7 @@ define void @v_shuffle_v2i64_v8i64__12_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v0
-; GFX942-NEXT:    v_mov_b32_e32 v13, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2182,8 +2147,7 @@ define void @v_shuffle_v2i64_v8i64__13_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v0
-; GFX942-NEXT:    v_mov_b32_e32 v15, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2238,8 +2202,7 @@ define void @v_shuffle_v2i64_v8i64__14_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v16, v0
-; GFX942-NEXT:    v_mov_b32_e32 v17, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[14:17], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2362,8 +2325,7 @@ define void @v_shuffle_v2i64_v8i64__1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2407,8 +2369,7 @@ define void @v_shuffle_v2i64_v8i64__2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v2
-; GFX942-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2452,8 +2413,7 @@ define void @v_shuffle_v2i64_v8i64__3_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v2
-; GFX942-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2497,8 +2457,7 @@ define void @v_shuffle_v2i64_v8i64__4_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v2
-; GFX942-NEXT:    v_mov_b32_e32 v11, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2542,8 +2501,7 @@ define void @v_shuffle_v2i64_v8i64__5_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v2
-; GFX942-NEXT:    v_mov_b32_e32 v13, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2587,8 +2545,7 @@ define void @v_shuffle_v2i64_v8i64__6_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v2
-; GFX942-NEXT:    v_mov_b32_e32 v15, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2632,8 +2589,7 @@ define void @v_shuffle_v2i64_v8i64__7_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v14
-; GFX942-NEXT:    v_mov_b32_e32 v1, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2726,8 +2682,7 @@ define void @v_shuffle_v2i64_v8i64__9_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v2
-; GFX942-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2782,8 +2737,7 @@ define void @v_shuffle_v2i64_v8i64__10_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v2
-; GFX942-NEXT:    v_mov_b32_e32 v11, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2838,8 +2792,7 @@ define void @v_shuffle_v2i64_v8i64__11_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v2
-; GFX942-NEXT:    v_mov_b32_e32 v13, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2894,8 +2847,7 @@ define void @v_shuffle_v2i64_v8i64__12_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v2
-; GFX942-NEXT:    v_mov_b32_e32 v15, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2950,8 +2902,7 @@ define void @v_shuffle_v2i64_v8i64__13_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v16, v2
-; GFX942-NEXT:    v_mov_b32_e32 v17, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[14:17], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3006,8 +2957,7 @@ define void @v_shuffle_v2i64_v8i64__14_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v18, v2
-; GFX942-NEXT:    v_mov_b32_e32 v19, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[16:19], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3091,8 +3041,7 @@ define void @v_shuffle_v2i64_v8i64__0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3175,8 +3124,7 @@ define void @v_shuffle_v2i64_v8i64__2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3220,8 +3168,7 @@ define void @v_shuffle_v2i64_v8i64__3_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v4
-; GFX942-NEXT:    v_mov_b32_e32 v9, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3265,8 +3212,7 @@ define void @v_shuffle_v2i64_v8i64__4_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v4
-; GFX942-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3310,8 +3256,7 @@ define void @v_shuffle_v2i64_v8i64__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v4
-; GFX942-NEXT:    v_mov_b32_e32 v13, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3355,8 +3300,7 @@ define void @v_shuffle_v2i64_v8i64__6_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v4
-; GFX942-NEXT:    v_mov_b32_e32 v15, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3400,8 +3344,7 @@ define void @v_shuffle_v2i64_v8i64__7_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v14
-; GFX942-NEXT:    v_mov_b32_e32 v3, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3494,8 +3437,7 @@ define void @v_shuffle_v2i64_v8i64__9_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v4
-; GFX942-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3550,8 +3492,7 @@ define void @v_shuffle_v2i64_v8i64__10_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v4
-; GFX942-NEXT:    v_mov_b32_e32 v13, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3606,8 +3547,7 @@ define void @v_shuffle_v2i64_v8i64__11_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v4
-; GFX942-NEXT:    v_mov_b32_e32 v15, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3662,8 +3602,7 @@ define void @v_shuffle_v2i64_v8i64__12_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v16, v4
-; GFX942-NEXT:    v_mov_b32_e32 v17, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[14:17], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3718,8 +3657,7 @@ define void @v_shuffle_v2i64_v8i64__13_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v18, v4
-; GFX942-NEXT:    v_mov_b32_e32 v19, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[16:19], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3774,8 +3712,7 @@ define void @v_shuffle_v2i64_v8i64__14_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v20, v4
-; GFX942-NEXT:    v_mov_b32_e32 v21, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[18:21], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3859,8 +3796,7 @@ define void @v_shuffle_v2i64_v8i64__0_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3904,8 +3840,7 @@ define void @v_shuffle_v2i64_v8i64__1_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3988,8 +3923,7 @@ define void @v_shuffle_v2i64_v8i64__3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v6
-; GFX942-NEXT:    v_mov_b32_e32 v9, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4033,8 +3967,7 @@ define void @v_shuffle_v2i64_v8i64__4_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v6
-; GFX942-NEXT:    v_mov_b32_e32 v11, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4078,8 +4011,7 @@ define void @v_shuffle_v2i64_v8i64__5_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v6
-; GFX942-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4123,8 +4055,7 @@ define void @v_shuffle_v2i64_v8i64__6_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v6
-; GFX942-NEXT:    v_mov_b32_e32 v15, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4168,8 +4099,7 @@ define void @v_shuffle_v2i64_v8i64__7_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v14
-; GFX942-NEXT:    v_mov_b32_e32 v5, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4262,8 +4192,7 @@ define void @v_shuffle_v2i64_v8i64__9_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v6
-; GFX942-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4318,8 +4247,7 @@ define void @v_shuffle_v2i64_v8i64__10_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v6
-; GFX942-NEXT:    v_mov_b32_e32 v15, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4374,8 +4302,7 @@ define void @v_shuffle_v2i64_v8i64__11_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v16, v6
-; GFX942-NEXT:    v_mov_b32_e32 v17, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[14:17], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4430,8 +4357,7 @@ define void @v_shuffle_v2i64_v8i64__12_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v18, v6
-; GFX942-NEXT:    v_mov_b32_e32 v19, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[16:19], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4486,8 +4412,7 @@ define void @v_shuffle_v2i64_v8i64__13_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v20, v6
-; GFX942-NEXT:    v_mov_b32_e32 v21, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[18:21], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4542,8 +4467,7 @@ define void @v_shuffle_v2i64_v8i64__14_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v22, v6
-; GFX942-NEXT:    v_mov_b32_e32 v23, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[22:23], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[20:23], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4627,8 +4551,7 @@ define void @v_shuffle_v2i64_v8i64__0_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4672,8 +4595,7 @@ define void @v_shuffle_v2i64_v8i64__1_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v8
-; GFX942-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4717,8 +4639,7 @@ define void @v_shuffle_v2i64_v8i64__2_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v8
-; GFX942-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4801,8 +4722,7 @@ define void @v_shuffle_v2i64_v8i64__4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v8
-; GFX942-NEXT:    v_mov_b32_e32 v11, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4846,8 +4766,7 @@ define void @v_shuffle_v2i64_v8i64__5_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v8
-; GFX942-NEXT:    v_mov_b32_e32 v13, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4891,8 +4810,7 @@ define void @v_shuffle_v2i64_v8i64__6_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v8
-; GFX942-NEXT:    v_mov_b32_e32 v15, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -4936,8 +4854,7 @@ define void @v_shuffle_v2i64_v8i64__7_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v14
-; GFX942-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5030,8 +4947,7 @@ define void @v_shuffle_v2i64_v8i64__9_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v8
-; GFX942-NEXT:    v_mov_b32_e32 v15, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5086,8 +5002,7 @@ define void @v_shuffle_v2i64_v8i64__10_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v16, v8
-; GFX942-NEXT:    v_mov_b32_e32 v17, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[14:17], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5142,8 +5057,7 @@ define void @v_shuffle_v2i64_v8i64__11_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v18, v8
-; GFX942-NEXT:    v_mov_b32_e32 v19, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[16:19], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5198,8 +5112,7 @@ define void @v_shuffle_v2i64_v8i64__12_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v20, v8
-; GFX942-NEXT:    v_mov_b32_e32 v21, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[18:21], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5254,8 +5167,7 @@ define void @v_shuffle_v2i64_v8i64__13_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v22, v8
-; GFX942-NEXT:    v_mov_b32_e32 v23, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[22:23], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[20:23], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5310,8 +5222,7 @@ define void @v_shuffle_v2i64_v8i64__14_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v24, v8
-; GFX942-NEXT:    v_mov_b32_e32 v25, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[24:25], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[22:25], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5395,8 +5306,7 @@ define void @v_shuffle_v2i64_v8i64__0_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v10
-; GFX942-NEXT:    v_mov_b32_e32 v3, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5440,8 +5350,7 @@ define void @v_shuffle_v2i64_v8i64__1_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v10
-; GFX942-NEXT:    v_mov_b32_e32 v5, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5485,8 +5394,7 @@ define void @v_shuffle_v2i64_v8i64__2_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v10
-; GFX942-NEXT:    v_mov_b32_e32 v7, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5530,8 +5438,7 @@ define void @v_shuffle_v2i64_v8i64__3_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v10
-; GFX942-NEXT:    v_mov_b32_e32 v9, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5614,8 +5521,7 @@ define void @v_shuffle_v2i64_v8i64__5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v10
-; GFX942-NEXT:    v_mov_b32_e32 v13, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5659,8 +5565,7 @@ define void @v_shuffle_v2i64_v8i64__6_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v10
-; GFX942-NEXT:    v_mov_b32_e32 v15, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5704,8 +5609,7 @@ define void @v_shuffle_v2i64_v8i64__7_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v14
-; GFX942-NEXT:    v_mov_b32_e32 v9, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5798,8 +5702,7 @@ define void @v_shuffle_v2i64_v8i64__9_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v16, v10
-; GFX942-NEXT:    v_mov_b32_e32 v17, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[14:17], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5854,8 +5757,7 @@ define void @v_shuffle_v2i64_v8i64__10_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v18, v10
-; GFX942-NEXT:    v_mov_b32_e32 v19, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[16:19], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5910,8 +5812,7 @@ define void @v_shuffle_v2i64_v8i64__11_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v20, v10
-; GFX942-NEXT:    v_mov_b32_e32 v21, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[18:21], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -5966,8 +5867,7 @@ define void @v_shuffle_v2i64_v8i64__12_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v22, v10
-; GFX942-NEXT:    v_mov_b32_e32 v23, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[22:23], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[20:23], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6022,8 +5922,7 @@ define void @v_shuffle_v2i64_v8i64__13_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v24, v10
-; GFX942-NEXT:    v_mov_b32_e32 v25, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[24:25], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[22:25], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6078,8 +5977,7 @@ define void @v_shuffle_v2i64_v8i64__14_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v26, v10
-; GFX942-NEXT:    v_mov_b32_e32 v27, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[26:27], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[24:27], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6163,8 +6061,7 @@ define void @v_shuffle_v2i64_v8i64__0_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v12
-; GFX942-NEXT:    v_mov_b32_e32 v3, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6208,8 +6105,7 @@ define void @v_shuffle_v2i64_v8i64__1_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v12
-; GFX942-NEXT:    v_mov_b32_e32 v5, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6253,8 +6149,7 @@ define void @v_shuffle_v2i64_v8i64__2_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v12
-; GFX942-NEXT:    v_mov_b32_e32 v7, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6298,8 +6193,7 @@ define void @v_shuffle_v2i64_v8i64__3_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v12
-; GFX942-NEXT:    v_mov_b32_e32 v9, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6343,8 +6237,7 @@ define void @v_shuffle_v2i64_v8i64__4_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v12
-; GFX942-NEXT:    v_mov_b32_e32 v11, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6427,8 +6320,7 @@ define void @v_shuffle_v2i64_v8i64__6_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v12
-; GFX942-NEXT:    v_mov_b32_e32 v15, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6472,8 +6364,7 @@ define void @v_shuffle_v2i64_v8i64__7_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v14
-; GFX942-NEXT:    v_mov_b32_e32 v11, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6566,8 +6457,7 @@ define void @v_shuffle_v2i64_v8i64__9_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v18, v12
-; GFX942-NEXT:    v_mov_b32_e32 v19, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[16:19], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6622,8 +6512,7 @@ define void @v_shuffle_v2i64_v8i64__10_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v20, v12
-; GFX942-NEXT:    v_mov_b32_e32 v21, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[18:21], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6678,8 +6567,7 @@ define void @v_shuffle_v2i64_v8i64__11_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v22, v12
-; GFX942-NEXT:    v_mov_b32_e32 v23, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[22:23], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[20:23], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6734,8 +6622,7 @@ define void @v_shuffle_v2i64_v8i64__12_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v24, v12
-; GFX942-NEXT:    v_mov_b32_e32 v25, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[24:25], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[22:25], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6790,8 +6677,7 @@ define void @v_shuffle_v2i64_v8i64__13_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v26, v12
-; GFX942-NEXT:    v_mov_b32_e32 v27, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[26:27], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[24:27], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6846,8 +6732,7 @@ define void @v_shuffle_v2i64_v8i64__14_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v28, v12
-; GFX942-NEXT:    v_mov_b32_e32 v29, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[28:29], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[26:29], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6931,8 +6816,7 @@ define void @v_shuffle_v2i64_v8i64__0_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v14
-; GFX942-NEXT:    v_mov_b32_e32 v3, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -6976,8 +6860,7 @@ define void @v_shuffle_v2i64_v8i64__1_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v14
-; GFX942-NEXT:    v_mov_b32_e32 v5, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7021,8 +6904,7 @@ define void @v_shuffle_v2i64_v8i64__2_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v14
-; GFX942-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7066,8 +6948,7 @@ define void @v_shuffle_v2i64_v8i64__3_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v14
-; GFX942-NEXT:    v_mov_b32_e32 v9, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7111,8 +6992,7 @@ define void @v_shuffle_v2i64_v8i64__4_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v14
-; GFX942-NEXT:    v_mov_b32_e32 v11, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7156,8 +7036,7 @@ define void @v_shuffle_v2i64_v8i64__5_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v14
-; GFX942-NEXT:    v_mov_b32_e32 v13, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7240,8 +7119,7 @@ define void @v_shuffle_v2i64_v8i64__7_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v14
-; GFX942-NEXT:    v_mov_b32_e32 v13, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7334,8 +7212,7 @@ define void @v_shuffle_v2i64_v8i64__9_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v20, v14
-; GFX942-NEXT:    v_mov_b32_e32 v21, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v32, v[18:21], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7390,8 +7267,7 @@ define void @v_shuffle_v2i64_v8i64__10_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v22, v14
-; GFX942-NEXT:    v_mov_b32_e32 v23, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[22:23], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7446,8 +7322,7 @@ define void @v_shuffle_v2i64_v8i64__11_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v24, v14
-; GFX942-NEXT:    v_mov_b32_e32 v25, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[24:25], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v32, v[22:25], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7502,8 +7377,7 @@ define void @v_shuffle_v2i64_v8i64__12_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v26, v14
-; GFX942-NEXT:    v_mov_b32_e32 v27, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[26:27], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7558,8 +7432,7 @@ define void @v_shuffle_v2i64_v8i64__13_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v28, v14
-; GFX942-NEXT:    v_mov_b32_e32 v29, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[28:29], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v32, v[26:29], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7614,8 +7487,7 @@ define void @v_shuffle_v2i64_v8i64__14_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v30, v14
-; GFX942-NEXT:    v_mov_b32_e32 v31, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[30:31], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -7944,8 +7816,7 @@ define void @v_shuffle_v2i64_v8i64__7_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v14
-; GFX942-NEXT:    v_mov_b32_e32 v1, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8000,8 +7871,7 @@ define void @v_shuffle_v2i64_v8i64__9_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8046,8 +7916,7 @@ define void @v_shuffle_v2i64_v8i64__10_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8092,8 +7961,7 @@ define void @v_shuffle_v2i64_v8i64__11_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v0
-; GFX942-NEXT:    v_mov_b32_e32 v9, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8138,8 +8006,7 @@ define void @v_shuffle_v2i64_v8i64__12_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v0
-; GFX942-NEXT:    v_mov_b32_e32 v11, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8184,8 +8051,7 @@ define void @v_shuffle_v2i64_v8i64__13_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v0
-; GFX942-NEXT:    v_mov_b32_e32 v13, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8230,8 +8096,7 @@ define void @v_shuffle_v2i64_v8i64__14_8(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v0
-; GFX942-NEXT:    v_mov_b32_e32 v15, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8326,8 +8191,7 @@ define void @v_shuffle_v2i64_v8i64__0_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8382,8 +8246,7 @@ define void @v_shuffle_v2i64_v8i64__1_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8438,8 +8301,7 @@ define void @v_shuffle_v2i64_v8i64__2_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v8
-; GFX942-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8494,8 +8356,7 @@ define void @v_shuffle_v2i64_v8i64__3_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v10
-; GFX942-NEXT:    v_mov_b32_e32 v9, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8550,8 +8411,7 @@ define void @v_shuffle_v2i64_v8i64__4_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v12
-; GFX942-NEXT:    v_mov_b32_e32 v11, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8606,8 +8466,7 @@ define void @v_shuffle_v2i64_v8i64__5_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v14
-; GFX942-NEXT:    v_mov_b32_e32 v13, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8662,8 +8521,7 @@ define void @v_shuffle_v2i64_v8i64__6_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v16
-; GFX942-NEXT:    v_mov_b32_e32 v15, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8718,8 +8576,7 @@ define void @v_shuffle_v2i64_v8i64__7_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v16
-; GFX942-NEXT:    v_mov_b32_e32 v1, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8804,8 +8661,7 @@ define void @v_shuffle_v2i64_v8i64__9_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8850,8 +8706,7 @@ define void @v_shuffle_v2i64_v8i64__10_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v2
-; GFX942-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8896,8 +8751,7 @@ define void @v_shuffle_v2i64_v8i64__11_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v2
-; GFX942-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8942,8 +8796,7 @@ define void @v_shuffle_v2i64_v8i64__12_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v2
-; GFX942-NEXT:    v_mov_b32_e32 v11, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -8988,8 +8841,7 @@ define void @v_shuffle_v2i64_v8i64__13_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v2
-; GFX942-NEXT:    v_mov_b32_e32 v13, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9034,8 +8886,7 @@ define void @v_shuffle_v2i64_v8i64__14_9(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v2
-; GFX942-NEXT:    v_mov_b32_e32 v15, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9130,8 +8981,7 @@ define void @v_shuffle_v2i64_v8i64__0_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9186,8 +9036,7 @@ define void @v_shuffle_v2i64_v8i64__1_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v8
-; GFX942-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9242,8 +9091,7 @@ define void @v_shuffle_v2i64_v8i64__2_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v10
-; GFX942-NEXT:    v_mov_b32_e32 v7, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9298,8 +9146,7 @@ define void @v_shuffle_v2i64_v8i64__3_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v12
-; GFX942-NEXT:    v_mov_b32_e32 v9, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9354,8 +9201,7 @@ define void @v_shuffle_v2i64_v8i64__4_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v14
-; GFX942-NEXT:    v_mov_b32_e32 v11, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9410,8 +9256,7 @@ define void @v_shuffle_v2i64_v8i64__5_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v16
-; GFX942-NEXT:    v_mov_b32_e32 v13, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9466,8 +9311,7 @@ define void @v_shuffle_v2i64_v8i64__6_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v18
-; GFX942-NEXT:    v_mov_b32_e32 v15, v19
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[18:19]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9522,8 +9366,7 @@ define void @v_shuffle_v2i64_v8i64__7_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v16
-; GFX942-NEXT:    v_mov_b32_e32 v3, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9568,8 +9411,7 @@ define void @v_shuffle_v2i64_v8i64__8_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9654,8 +9496,7 @@ define void @v_shuffle_v2i64_v8i64__10_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9700,8 +9541,7 @@ define void @v_shuffle_v2i64_v8i64__11_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v4
-; GFX942-NEXT:    v_mov_b32_e32 v9, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9746,8 +9586,7 @@ define void @v_shuffle_v2i64_v8i64__12_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v4
-; GFX942-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9792,8 +9631,7 @@ define void @v_shuffle_v2i64_v8i64__13_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v4
-; GFX942-NEXT:    v_mov_b32_e32 v13, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9838,8 +9676,7 @@ define void @v_shuffle_v2i64_v8i64__14_10(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v4
-; GFX942-NEXT:    v_mov_b32_e32 v15, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9934,8 +9771,7 @@ define void @v_shuffle_v2i64_v8i64__0_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -9990,8 +9826,7 @@ define void @v_shuffle_v2i64_v8i64__1_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v10
-; GFX942-NEXT:    v_mov_b32_e32 v5, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10046,8 +9881,7 @@ define void @v_shuffle_v2i64_v8i64__2_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v12
-; GFX942-NEXT:    v_mov_b32_e32 v7, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10102,8 +9936,7 @@ define void @v_shuffle_v2i64_v8i64__3_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v14
-; GFX942-NEXT:    v_mov_b32_e32 v9, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10158,8 +9991,7 @@ define void @v_shuffle_v2i64_v8i64__4_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v16
-; GFX942-NEXT:    v_mov_b32_e32 v11, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10214,8 +10046,7 @@ define void @v_shuffle_v2i64_v8i64__5_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v18
-; GFX942-NEXT:    v_mov_b32_e32 v13, v19
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[18:19]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10270,8 +10101,7 @@ define void @v_shuffle_v2i64_v8i64__6_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v20
-; GFX942-NEXT:    v_mov_b32_e32 v15, v21
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[20:21]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10326,8 +10156,7 @@ define void @v_shuffle_v2i64_v8i64__7_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v16
-; GFX942-NEXT:    v_mov_b32_e32 v5, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10372,8 +10201,7 @@ define void @v_shuffle_v2i64_v8i64__8_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10418,8 +10246,7 @@ define void @v_shuffle_v2i64_v8i64__9_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10504,8 +10331,7 @@ define void @v_shuffle_v2i64_v8i64__11_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v6
-; GFX942-NEXT:    v_mov_b32_e32 v9, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10550,8 +10376,7 @@ define void @v_shuffle_v2i64_v8i64__12_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v6
-; GFX942-NEXT:    v_mov_b32_e32 v11, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10596,8 +10421,7 @@ define void @v_shuffle_v2i64_v8i64__13_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v6
-; GFX942-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10642,8 +10466,7 @@ define void @v_shuffle_v2i64_v8i64__14_11(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v6
-; GFX942-NEXT:    v_mov_b32_e32 v15, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10738,8 +10561,7 @@ define void @v_shuffle_v2i64_v8i64__0_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v10
-; GFX942-NEXT:    v_mov_b32_e32 v3, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10794,8 +10616,7 @@ define void @v_shuffle_v2i64_v8i64__1_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v12
-; GFX942-NEXT:    v_mov_b32_e32 v5, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10850,8 +10671,7 @@ define void @v_shuffle_v2i64_v8i64__2_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v14
-; GFX942-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10906,8 +10726,7 @@ define void @v_shuffle_v2i64_v8i64__3_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v16
-; GFX942-NEXT:    v_mov_b32_e32 v9, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -10962,8 +10781,7 @@ define void @v_shuffle_v2i64_v8i64__4_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v18
-; GFX942-NEXT:    v_mov_b32_e32 v11, v19
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[18:19]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11018,8 +10836,7 @@ define void @v_shuffle_v2i64_v8i64__5_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v20
-; GFX942-NEXT:    v_mov_b32_e32 v13, v21
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[20:21]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11074,8 +10891,7 @@ define void @v_shuffle_v2i64_v8i64__6_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v22
-; GFX942-NEXT:    v_mov_b32_e32 v15, v23
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[22:23]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11130,8 +10946,7 @@ define void @v_shuffle_v2i64_v8i64__7_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v16
-; GFX942-NEXT:    v_mov_b32_e32 v7, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11176,8 +10991,7 @@ define void @v_shuffle_v2i64_v8i64__8_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11222,8 +11036,7 @@ define void @v_shuffle_v2i64_v8i64__9_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v8
-; GFX942-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11268,8 +11081,7 @@ define void @v_shuffle_v2i64_v8i64__10_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v8
-; GFX942-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11354,8 +11166,7 @@ define void @v_shuffle_v2i64_v8i64__12_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v8
-; GFX942-NEXT:    v_mov_b32_e32 v11, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11400,8 +11211,7 @@ define void @v_shuffle_v2i64_v8i64__13_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v8
-; GFX942-NEXT:    v_mov_b32_e32 v13, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11446,8 +11256,7 @@ define void @v_shuffle_v2i64_v8i64__14_12(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v8
-; GFX942-NEXT:    v_mov_b32_e32 v15, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11542,8 +11351,7 @@ define void @v_shuffle_v2i64_v8i64__0_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v12
-; GFX942-NEXT:    v_mov_b32_e32 v3, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11598,8 +11406,7 @@ define void @v_shuffle_v2i64_v8i64__1_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v14
-; GFX942-NEXT:    v_mov_b32_e32 v5, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11654,8 +11461,7 @@ define void @v_shuffle_v2i64_v8i64__2_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v16
-; GFX942-NEXT:    v_mov_b32_e32 v7, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11710,8 +11516,7 @@ define void @v_shuffle_v2i64_v8i64__3_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v18
-; GFX942-NEXT:    v_mov_b32_e32 v9, v19
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[18:19]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11766,8 +11571,7 @@ define void @v_shuffle_v2i64_v8i64__4_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v20
-; GFX942-NEXT:    v_mov_b32_e32 v11, v21
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[20:21]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11822,8 +11626,7 @@ define void @v_shuffle_v2i64_v8i64__5_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v22
-; GFX942-NEXT:    v_mov_b32_e32 v13, v23
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[22:23]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11878,8 +11681,7 @@ define void @v_shuffle_v2i64_v8i64__6_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v24
-; GFX942-NEXT:    v_mov_b32_e32 v15, v25
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[24:25]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11934,8 +11736,7 @@ define void @v_shuffle_v2i64_v8i64__7_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v16
-; GFX942-NEXT:    v_mov_b32_e32 v9, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -11980,8 +11781,7 @@ define void @v_shuffle_v2i64_v8i64__8_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v10
-; GFX942-NEXT:    v_mov_b32_e32 v3, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12026,8 +11826,7 @@ define void @v_shuffle_v2i64_v8i64__9_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v10
-; GFX942-NEXT:    v_mov_b32_e32 v5, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12072,8 +11871,7 @@ define void @v_shuffle_v2i64_v8i64__10_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v10
-; GFX942-NEXT:    v_mov_b32_e32 v7, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12118,8 +11916,7 @@ define void @v_shuffle_v2i64_v8i64__11_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v10
-; GFX942-NEXT:    v_mov_b32_e32 v9, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12204,8 +12001,7 @@ define void @v_shuffle_v2i64_v8i64__13_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v10
-; GFX942-NEXT:    v_mov_b32_e32 v13, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12250,8 +12046,7 @@ define void @v_shuffle_v2i64_v8i64__14_13(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v10
-; GFX942-NEXT:    v_mov_b32_e32 v15, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12346,8 +12141,7 @@ define void @v_shuffle_v2i64_v8i64__0_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v14
-; GFX942-NEXT:    v_mov_b32_e32 v3, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12402,8 +12196,7 @@ define void @v_shuffle_v2i64_v8i64__1_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v16
-; GFX942-NEXT:    v_mov_b32_e32 v5, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12458,8 +12251,7 @@ define void @v_shuffle_v2i64_v8i64__2_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v18
-; GFX942-NEXT:    v_mov_b32_e32 v7, v19
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12514,8 +12306,7 @@ define void @v_shuffle_v2i64_v8i64__3_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v20
-; GFX942-NEXT:    v_mov_b32_e32 v9, v21
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12570,8 +12361,7 @@ define void @v_shuffle_v2i64_v8i64__4_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v22
-; GFX942-NEXT:    v_mov_b32_e32 v11, v23
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12626,8 +12416,7 @@ define void @v_shuffle_v2i64_v8i64__5_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v24
-; GFX942-NEXT:    v_mov_b32_e32 v13, v25
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12682,8 +12471,7 @@ define void @v_shuffle_v2i64_v8i64__6_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v26
-; GFX942-NEXT:    v_mov_b32_e32 v15, v27
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12738,8 +12526,7 @@ define void @v_shuffle_v2i64_v8i64__7_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v16
-; GFX942-NEXT:    v_mov_b32_e32 v11, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12784,8 +12571,7 @@ define void @v_shuffle_v2i64_v8i64__8_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v12
-; GFX942-NEXT:    v_mov_b32_e32 v3, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12830,8 +12616,7 @@ define void @v_shuffle_v2i64_v8i64__9_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v12
-; GFX942-NEXT:    v_mov_b32_e32 v5, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12876,8 +12661,7 @@ define void @v_shuffle_v2i64_v8i64__10_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v12
-; GFX942-NEXT:    v_mov_b32_e32 v7, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12922,8 +12706,7 @@ define void @v_shuffle_v2i64_v8i64__11_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v12
-; GFX942-NEXT:    v_mov_b32_e32 v9, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -12968,8 +12751,7 @@ define void @v_shuffle_v2i64_v8i64__12_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v12
-; GFX942-NEXT:    v_mov_b32_e32 v11, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13054,8 +12836,7 @@ define void @v_shuffle_v2i64_v8i64__14_14(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v12
-; GFX942-NEXT:    v_mov_b32_e32 v15, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13150,8 +12931,7 @@ define void @v_shuffle_v2i64_v8i64__0_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:17]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v16
-; GFX942-NEXT:    v_mov_b32_e32 v3, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13206,8 +12986,7 @@ define void @v_shuffle_v2i64_v8i64__1_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v18
-; GFX942-NEXT:    v_mov_b32_e32 v5, v19
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[18:19]
 ; GFX942-NEXT:    global_store_dwordx4 v20, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13262,8 +13041,7 @@ define void @v_shuffle_v2i64_v8i64__2_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:21]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v20
-; GFX942-NEXT:    v_mov_b32_e32 v7, v21
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[20:21]
 ; GFX942-NEXT:    global_store_dwordx4 v22, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13318,8 +13096,7 @@ define void @v_shuffle_v2i64_v8i64__3_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v22
-; GFX942-NEXT:    v_mov_b32_e32 v9, v23
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[22:23]
 ; GFX942-NEXT:    global_store_dwordx4 v24, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13374,8 +13151,7 @@ define void @v_shuffle_v2i64_v8i64__4_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[10:25]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v24
-; GFX942-NEXT:    v_mov_b32_e32 v11, v25
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[24:25]
 ; GFX942-NEXT:    global_store_dwordx4 v26, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13430,8 +13206,7 @@ define void @v_shuffle_v2i64_v8i64__5_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v26
-; GFX942-NEXT:    v_mov_b32_e32 v13, v27
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[26:27]
 ; GFX942-NEXT:    global_store_dwordx4 v28, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13486,8 +13261,7 @@ define void @v_shuffle_v2i64_v8i64__6_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[14:29]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v28
-; GFX942-NEXT:    v_mov_b32_e32 v15, v29
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[28:29]
 ; GFX942-NEXT:    global_store_dwordx4 v30, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13542,8 +13316,7 @@ define void @v_shuffle_v2i64_v8i64__7_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v16
-; GFX942-NEXT:    v_mov_b32_e32 v13, v17
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[16:17]
 ; GFX942-NEXT:    global_store_dwordx4 v18, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13588,8 +13361,7 @@ define void @v_shuffle_v2i64_v8i64__8_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v14
-; GFX942-NEXT:    v_mov_b32_e32 v3, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13634,8 +13406,7 @@ define void @v_shuffle_v2i64_v8i64__9_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v14
-; GFX942-NEXT:    v_mov_b32_e32 v5, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13680,8 +13451,7 @@ define void @v_shuffle_v2i64_v8i64__10_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v14
-; GFX942-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13726,8 +13496,7 @@ define void @v_shuffle_v2i64_v8i64__11_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v14
-; GFX942-NEXT:    v_mov_b32_e32 v9, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13772,8 +13541,7 @@ define void @v_shuffle_v2i64_v8i64__12_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v14
-; GFX942-NEXT:    v_mov_b32_e32 v11, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13818,8 +13586,7 @@ define void @v_shuffle_v2i64_v8i64__13_15(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v14
-; GFX942-NEXT:    v_mov_b32_e32 v13, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -13957,8 +13724,7 @@ define void @s_shuffle_v2i64_v8i64__1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14042,8 +13808,7 @@ define void @s_shuffle_v2i64_v8i64__3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14131,8 +13896,7 @@ define void @s_shuffle_v2i64_v8i64__5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14221,8 +13985,7 @@ define void @s_shuffle_v2i64_v8i64__7_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14280,8 +14043,7 @@ define void @s_shuffle_v2i64_v8i64__9_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14367,8 +14129,7 @@ define void @s_shuffle_v2i64_v8i64__11_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14458,8 +14219,7 @@ define void @s_shuffle_v2i64_v8i64__13_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14550,8 +14310,7 @@ define void @s_shuffle_v2i64_v8i64__15_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14610,10 +14369,8 @@ define void @s_shuffle_v2i64_v8i64__15_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14668,8 +14425,7 @@ define void @s_shuffle_v2i64_v8i64__15_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s26
-; GFX942-NEXT:    s_mov_b32 s9, s27
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14750,10 +14506,8 @@ define void @s_shuffle_v2i64_v8i64__15_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14808,8 +14562,7 @@ define void @s_shuffle_v2i64_v8i64__15_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s26
-; GFX942-NEXT:    s_mov_b32 s9, s27
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14894,10 +14647,8 @@ define void @s_shuffle_v2i64_v8i64__15_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s30
-; GFX942-NEXT:    s_mov_b32 s9, s31
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[30:31]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14984,8 +14735,7 @@ define void @s_shuffle_v2i64_v8i64__15_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s26
-; GFX942-NEXT:    s_mov_b32 s9, s27
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15094,10 +14844,8 @@ define void @s_shuffle_v2i64_v8i64__15_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s30
-; GFX942-NEXT:    s_mov_b32 s9, s31
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[30:31]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15212,8 +14960,7 @@ define void @s_shuffle_v2i64_v8i64__15_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s30
-; GFX942-NEXT:    s_mov_b32 s13, s31
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[30:31]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -15270,10 +15017,8 @@ define void @s_shuffle_v2i64_v8i64__15_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15286,18 +15031,43 @@ define void @s_shuffle_v2i64_v8i64__15_8() {
 }
 
 define void @s_shuffle_v2i64_v8i64__15_9() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__15_9:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s22
-; GFX9-NEXT:    s_mov_b32 s9, s23
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_9:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_9:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_9:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 15, i32 9>
@@ -15342,10 +15112,8 @@ define void @s_shuffle_v2i64_v8i64__15_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15358,18 +15126,43 @@ define void @s_shuffle_v2i64_v8i64__15_10() {
 }
 
 define void @s_shuffle_v2i64_v8i64__15_11() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__15_11:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s18
-; GFX9-NEXT:    s_mov_b32 s9, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_11:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_11:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_11:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 15, i32 11>
@@ -15378,20 +15171,48 @@ define void @s_shuffle_v2i64_v8i64__15_11() {
 }
 
 define void @s_shuffle_v2i64_v8i64__15_12() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__15_12:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s18
-; GFX9-NEXT:    s_mov_b32 s9, s19
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_12:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_12:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__15_12:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 15, i32 12>
@@ -15436,8 +15257,7 @@ define void @s_shuffle_v2i64_v8i64__15_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15486,10 +15306,8 @@ define void @s_shuffle_v2i64_v8i64__15_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15538,8 +15356,7 @@ define void @s_shuffle_v2i64_v8i64__15_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -15586,8 +15403,7 @@ define void @s_shuffle_v2i64_v8i64__u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15599,18 +15415,43 @@ define void @s_shuffle_v2i64_v8i64__u_0() {
 }
 
 define void @s_shuffle_v2i64_v8i64__0_0() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -15654,10 +15495,8 @@ define void @s_shuffle_v2i64_v8i64__1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15669,18 +15508,43 @@ define void @s_shuffle_v2i64_v8i64__1_0() {
 }
 
 define void @s_shuffle_v2i64_v8i64__2_0() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s4
-; GFX9-NEXT:    s_mov_b32 s11, s5
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 2, i32 0>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -15724,10 +15588,8 @@ define void @s_shuffle_v2i64_v8i64__3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15775,8 +15637,7 @@ define void @s_shuffle_v2i64_v8i64__4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15824,10 +15685,8 @@ define void @s_shuffle_v2i64_v8i64__5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15875,8 +15734,7 @@ define void @s_shuffle_v2i64_v8i64__6_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -15926,10 +15784,8 @@ define void @s_shuffle_v2i64_v8i64__7_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15973,8 +15829,7 @@ define void @s_shuffle_v2i64_v8i64__8_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16032,10 +15887,8 @@ define void @s_shuffle_v2i64_v8i64__9_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16136,8 +15989,7 @@ define void @s_shuffle_v2i64_v8i64__10_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16196,10 +16048,8 @@ define void @s_shuffle_v2i64_v8i64__11_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16262,8 +16112,7 @@ define void @s_shuffle_v2i64_v8i64__12_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16328,10 +16177,8 @@ define void @s_shuffle_v2i64_v8i64__13_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16390,8 +16237,7 @@ define void @s_shuffle_v2i64_v8i64__14_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s18, s0
-; GFX942-NEXT:    s_mov_b32 s19, s1
+; GFX942-NEXT:    s_mov_b64 s[18:19], s[0:1]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -16486,18 +16332,43 @@ define void @s_shuffle_v2i64_v8i64__0_1() {
 }
 
 define void @s_shuffle_v2i64_v8i64__1_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -16505,18 +16376,43 @@ define void @s_shuffle_v2i64_v8i64__1_1() {
 }
 
 define void @s_shuffle_v2i64_v8i64__2_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -16524,18 +16420,43 @@ define void @s_shuffle_v2i64_v8i64__2_1() {
 }
 
 define void @s_shuffle_v2i64_v8i64__3_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__3_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 3, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -16579,8 +16500,7 @@ define void @s_shuffle_v2i64_v8i64__4_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16592,18 +16512,43 @@ define void @s_shuffle_v2i64_v8i64__4_1() {
 }
 
 define void @s_shuffle_v2i64_v8i64__5_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__5_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s18
-; GFX9-NEXT:    s_mov_b32 s9, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 5, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -16647,8 +16592,7 @@ define void @s_shuffle_v2i64_v8i64__6_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -16662,18 +16606,43 @@ define void @s_shuffle_v2i64_v8i64__6_1() {
 }
 
 define void @s_shuffle_v2i64_v8i64__7_1() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__7_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s22
-; GFX9-NEXT:    s_mov_b32 s9, s23
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 7, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -16763,8 +16732,7 @@ define void @s_shuffle_v2i64_v8i64__9_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16865,8 +16833,7 @@ define void @s_shuffle_v2i64_v8i64__10_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16921,8 +16888,7 @@ define void @s_shuffle_v2i64_v8i64__11_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16985,8 +16951,7 @@ define void @s_shuffle_v2i64_v8i64__12_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17047,8 +17012,7 @@ define void @s_shuffle_v2i64_v8i64__13_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17107,8 +17071,7 @@ define void @s_shuffle_v2i64_v8i64__14_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s18, s2
-; GFX942-NEXT:    s_mov_b32 s19, s3
+; GFX942-NEXT:    s_mov_b64 s[18:19], s[2:3]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -17155,8 +17118,7 @@ define void @s_shuffle_v2i64_v8i64__u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17168,18 +17130,43 @@ define void @s_shuffle_v2i64_v8i64__u_2() {
 }
 
 define void @s_shuffle_v2i64_v8i64__0_2() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -17223,10 +17210,8 @@ define void @s_shuffle_v2i64_v8i64__1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17238,18 +17223,43 @@ define void @s_shuffle_v2i64_v8i64__1_2() {
 }
 
 define void @s_shuffle_v2i64_v8i64__2_2() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -17293,10 +17303,8 @@ define void @s_shuffle_v2i64_v8i64__3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17344,8 +17352,7 @@ define void @s_shuffle_v2i64_v8i64__4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17393,10 +17400,8 @@ define void @s_shuffle_v2i64_v8i64__5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17444,8 +17449,7 @@ define void @s_shuffle_v2i64_v8i64__6_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -17495,10 +17499,8 @@ define void @s_shuffle_v2i64_v8i64__7_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17542,8 +17544,7 @@ define void @s_shuffle_v2i64_v8i64__8_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17623,10 +17624,8 @@ define void @s_shuffle_v2i64_v8i64__9_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17708,8 +17707,7 @@ define void @s_shuffle_v2i64_v8i64__10_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17796,10 +17794,8 @@ define void @s_shuffle_v2i64_v8i64__11_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17858,8 +17854,7 @@ define void @s_shuffle_v2i64_v8i64__12_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17940,10 +17935,8 @@ define void @s_shuffle_v2i64_v8i64__13_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18002,8 +17995,7 @@ define void @s_shuffle_v2i64_v8i64__14_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s22, s4
-; GFX942-NEXT:    s_mov_b32 s23, s5
+; GFX942-NEXT:    s_mov_b64 s[22:23], s[4:5]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18058,37 +18050,87 @@ define void @s_shuffle_v2i64_v8i64__u_3() {
 }
 
 define void @s_shuffle_v2i64_v8i64__0_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__1_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__1_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__1_3() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 1, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18136,18 +18178,43 @@ define void @s_shuffle_v2i64_v8i64__2_3() {
 }
 
 define void @s_shuffle_v2i64_v8i64__3_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18191,8 +18258,7 @@ define void @s_shuffle_v2i64_v8i64__4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18204,18 +18270,43 @@ define void @s_shuffle_v2i64_v8i64__4_3() {
 }
 
 define void @s_shuffle_v2i64_v8i64__5_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__5_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 5, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18259,8 +18350,7 @@ define void @s_shuffle_v2i64_v8i64__6_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18274,18 +18364,43 @@ define void @s_shuffle_v2i64_v8i64__6_3() {
 }
 
 define void @s_shuffle_v2i64_v8i64__7_3() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__7_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s18
-; GFX9-NEXT:    s_mov_b32 s9, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 7, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18375,8 +18490,7 @@ define void @s_shuffle_v2i64_v8i64__9_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18458,8 +18572,7 @@ define void @s_shuffle_v2i64_v8i64__10_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18520,8 +18633,7 @@ define void @s_shuffle_v2i64_v8i64__11_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18580,8 +18692,7 @@ define void @s_shuffle_v2i64_v8i64__12_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18636,8 +18747,7 @@ define void @s_shuffle_v2i64_v8i64__13_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18696,8 +18806,7 @@ define void @s_shuffle_v2i64_v8i64__14_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s22, s6
-; GFX942-NEXT:    s_mov_b32 s23, s7
+; GFX942-NEXT:    s_mov_b64 s[22:23], s[6:7]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18744,8 +18853,7 @@ define void @s_shuffle_v2i64_v8i64__u_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s8
-; GFX942-NEXT:    s_mov_b32 s11, s9
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -18757,18 +18865,43 @@ define void @s_shuffle_v2i64_v8i64__u_4() {
 }
 
 define void @s_shuffle_v2i64_v8i64__0_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s16
-; GFX9-NEXT:    s_mov_b32 s11, s17
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 4>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18776,20 +18909,48 @@ define void @s_shuffle_v2i64_v8i64__0_4() {
 }
 
 define void @s_shuffle_v2i64_v8i64__1_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__1_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 1, i32 4>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18797,18 +18958,43 @@ define void @s_shuffle_v2i64_v8i64__1_4() {
 }
 
 define void @s_shuffle_v2i64_v8i64__2_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 2, i32 4>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18816,90 +19002,145 @@ define void @s_shuffle_v2i64_v8i64__2_4() {
 }
 
 define void @s_shuffle_v2i64_v8i64__3_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__3_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__3_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 3, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__4_4() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 3, i32 4>
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v2i64_v8i64__4_4() {
-; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_4:
+define void @s_shuffle_v2i64_v8i64__5_4() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
-; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
-; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_4:
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
-; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
-; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v2i64_v8i64__4_4:
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s8
-; GFX942-NEXT:    s_mov_b32 s11, s9
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 4, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__5_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__5_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 5, i32 4>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -18943,8 +19184,7 @@ define void @s_shuffle_v2i64_v8i64__6_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s8
-; GFX942-NEXT:    s_mov_b32 s15, s9
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18958,20 +19198,48 @@ define void @s_shuffle_v2i64_v8i64__6_4() {
 }
 
 define void @s_shuffle_v2i64_v8i64__7_4() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__7_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s18
-; GFX9-NEXT:    s_mov_b32 s9, s19
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 7, i32 4>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -19011,8 +19279,7 @@ define void @s_shuffle_v2i64_v8i64__8_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s8
-; GFX942-NEXT:    s_mov_b32 s11, s9
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19096,10 +19363,8 @@ define void @s_shuffle_v2i64_v8i64__9_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19160,8 +19425,7 @@ define void @s_shuffle_v2i64_v8i64__10_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19246,10 +19510,8 @@ define void @s_shuffle_v2i64_v8i64__11_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19336,8 +19598,7 @@ define void @s_shuffle_v2i64_v8i64__12_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19422,10 +19683,8 @@ define void @s_shuffle_v2i64_v8i64__13_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s26
-; GFX942-NEXT:    s_mov_b32 s9, s27
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[26:27]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19512,8 +19771,7 @@ define void @s_shuffle_v2i64_v8i64__14_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s26, s8
-; GFX942-NEXT:    s_mov_b32 s27, s9
+; GFX942-NEXT:    s_mov_b64 s[26:27], s[8:9]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[24:25]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19572,18 +19830,43 @@ define void @s_shuffle_v2i64_v8i64__u_5() {
 }
 
 define void @s_shuffle_v2i64_v8i64__0_5() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s18
-; GFX9-NEXT:    s_mov_b32 s11, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 5>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -19627,8 +19910,7 @@ define void @s_shuffle_v2i64_v8i64__1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19640,18 +19922,43 @@ define void @s_shuffle_v2i64_v8i64__1_5() {
 }
 
 define void @s_shuffle_v2i64_v8i64__2_5() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 2, i32 5>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -19695,8 +20002,7 @@ define void @s_shuffle_v2i64_v8i64__3_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19788,8 +20094,7 @@ define void @s_shuffle_v2i64_v8i64__5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -19837,8 +20142,7 @@ define void @s_shuffle_v2i64_v8i64__6_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19888,8 +20192,7 @@ define void @s_shuffle_v2i64_v8i64__7_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20013,8 +20316,7 @@ define void @s_shuffle_v2i64_v8i64__9_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20069,8 +20371,7 @@ define void @s_shuffle_v2i64_v8i64__10_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20151,8 +20452,7 @@ define void @s_shuffle_v2i64_v8i64__11_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20233,8 +20533,7 @@ define void @s_shuffle_v2i64_v8i64__12_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20315,8 +20614,7 @@ define void @s_shuffle_v2i64_v8i64__13_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20397,8 +20695,7 @@ define void @s_shuffle_v2i64_v8i64__14_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s26, s10
-; GFX942-NEXT:    s_mov_b32 s27, s11
+; GFX942-NEXT:    s_mov_b64 s[26:27], s[10:11]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[24:25]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20445,8 +20742,7 @@ define void @s_shuffle_v2i64_v8i64__u_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20458,18 +20754,43 @@ define void @s_shuffle_v2i64_v8i64__u_6() {
 }
 
 define void @s_shuffle_v2i64_v8i64__0_6() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s20
-; GFX9-NEXT:    s_mov_b32 s11, s21
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 6>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -20511,35 +20832,58 @@ define void @s_shuffle_v2i64_v8i64__1_6() {
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 1, i32 6>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__2_6() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 1, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__2_6() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s16
-; GFX9-NEXT:    s_mov_b32 s11, s17
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 2, i32 6>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -20583,10 +20927,8 @@ define void @s_shuffle_v2i64_v8i64__3_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20634,8 +20976,7 @@ define void @s_shuffle_v2i64_v8i64__4_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20683,10 +21024,8 @@ define void @s_shuffle_v2i64_v8i64__5_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20734,8 +21073,7 @@ define void @s_shuffle_v2i64_v8i64__6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20785,10 +21123,8 @@ define void @s_shuffle_v2i64_v8i64__7_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20832,8 +21168,7 @@ define void @s_shuffle_v2i64_v8i64__8_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -20941,10 +21276,8 @@ define void @s_shuffle_v2i64_v8i64__9_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -21005,8 +21338,7 @@ define void @s_shuffle_v2i64_v8i64__10_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -21115,10 +21447,8 @@ define void @s_shuffle_v2i64_v8i64__11_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -21229,8 +21559,7 @@ define void @s_shuffle_v2i64_v8i64__12_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -21339,10 +21668,8 @@ define void @s_shuffle_v2i64_v8i64__13_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s26
-; GFX942-NEXT:    s_mov_b32 s9, s27
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[26:27]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -21457,8 +21784,7 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s30, s12
-; GFX942-NEXT:    s_mov_b32 s31, s13
+; GFX942-NEXT:    s_mov_b64 s[30:31], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[30:31]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21524,18 +21850,43 @@ define void @s_shuffle_v2i64_v8i64__u_7() {
 }
 
 define void @s_shuffle_v2i64_v8i64__0_7() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__0_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s22
-; GFX9-NEXT:    s_mov_b32 s11, s23
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__0_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 7>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -21579,8 +21930,7 @@ define void @s_shuffle_v2i64_v8i64__1_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21594,18 +21944,43 @@ define void @s_shuffle_v2i64_v8i64__1_7() {
 }
 
 define void @s_shuffle_v2i64_v8i64__2_7() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__2_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s18
-; GFX9-NEXT:    s_mov_b32 s11, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__2_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 2, i32 7>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
@@ -21649,8 +22024,7 @@ define void @s_shuffle_v2i64_v8i64__3_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21700,8 +22074,7 @@ define void @s_shuffle_v2i64_v8i64__4_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -21749,8 +22122,7 @@ define void @s_shuffle_v2i64_v8i64__5_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21845,8 +22217,7 @@ define void @s_shuffle_v2i64_v8i64__7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -22001,8 +22372,7 @@ define void @s_shuffle_v2i64_v8i64__9_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s18
-; GFX942-NEXT:    s_mov_b32 s13, s19
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[18:19]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -22065,8 +22435,7 @@ define void @s_shuffle_v2i64_v8i64__10_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22175,8 +22544,7 @@ define void @s_shuffle_v2i64_v8i64__11_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s22
-; GFX942-NEXT:    s_mov_b32 s13, s23
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[22:23]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -22289,8 +22657,7 @@ define void @s_shuffle_v2i64_v8i64__12_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22399,8 +22766,7 @@ define void @s_shuffle_v2i64_v8i64__13_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s26
-; GFX942-NEXT:    s_mov_b32 s13, s27
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[26:27]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -22517,8 +22883,7 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s30, s14
-; GFX942-NEXT:    s_mov_b32 s31, s15
+; GFX942-NEXT:    s_mov_b64 s[30:31], s[14:15]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[28:29]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[30:31]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -22625,8 +22990,7 @@ define void @s_shuffle_v2i64_v8i64__1_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22710,8 +23074,7 @@ define void @s_shuffle_v2i64_v8i64__3_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22799,8 +23162,7 @@ define void @s_shuffle_v2i64_v8i64__5_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22889,8 +23251,7 @@ define void @s_shuffle_v2i64_v8i64__7_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22952,10 +23313,8 @@ define void @s_shuffle_v2i64_v8i64__9_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -22968,18 +23327,43 @@ define void @s_shuffle_v2i64_v8i64__9_8() {
 }
 
 define void @s_shuffle_v2i64_v8i64__10_8() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_8:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s4
-; GFX9-NEXT:    s_mov_b32 s11, s5
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_8:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_8:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 8>
@@ -23024,10 +23408,8 @@ define void @s_shuffle_v2i64_v8i64__11_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23076,8 +23458,7 @@ define void @s_shuffle_v2i64_v8i64__12_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23126,10 +23507,8 @@ define void @s_shuffle_v2i64_v8i64__13_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23178,8 +23557,7 @@ define void @s_shuffle_v2i64_v8i64__14_8() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -23277,8 +23655,7 @@ define void @s_shuffle_v2i64_v8i64__0_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23333,8 +23710,7 @@ define void @s_shuffle_v2i64_v8i64__1_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23389,8 +23765,7 @@ define void @s_shuffle_v2i64_v8i64__2_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23491,8 +23866,7 @@ define void @s_shuffle_v2i64_v8i64__3_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23573,8 +23947,7 @@ define void @s_shuffle_v2i64_v8i64__4_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23656,8 +24029,7 @@ define void @s_shuffle_v2i64_v8i64__5_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s26
-; GFX942-NEXT:    s_mov_b32 s9, s27
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23772,8 +24144,7 @@ define void @s_shuffle_v2i64_v8i64__6_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s18
-; GFX942-NEXT:    s_mov_b32 s15, s19
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[18:19]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -23836,8 +24207,7 @@ define void @s_shuffle_v2i64_v8i64__7_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s26
-; GFX942-NEXT:    s_mov_b32 s9, s27
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23891,18 +24261,43 @@ define void @s_shuffle_v2i64_v8i64__8_9() {
 }
 
 define void @s_shuffle_v2i64_v8i64__9_9() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__9_9:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_9:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_9:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_9:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 9, i32 9>
@@ -23911,18 +24306,43 @@ define void @s_shuffle_v2i64_v8i64__9_9() {
 }
 
 define void @s_shuffle_v2i64_v8i64__10_9() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_9:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_9:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_9:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_9:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 9>
@@ -23931,18 +24351,43 @@ define void @s_shuffle_v2i64_v8i64__10_9() {
 }
 
 define void @s_shuffle_v2i64_v8i64__11_9() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__11_9:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_9:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_9:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_9:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 11, i32 9>
@@ -23987,8 +24432,7 @@ define void @s_shuffle_v2i64_v8i64__12_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24001,18 +24445,43 @@ define void @s_shuffle_v2i64_v8i64__12_9() {
 }
 
 define void @s_shuffle_v2i64_v8i64__13_9() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__13_9:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s18
-; GFX9-NEXT:    s_mov_b32 s9, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_9:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_9:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_9:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 13, i32 9>
@@ -24057,8 +24526,7 @@ define void @s_shuffle_v2i64_v8i64__14_9() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -24105,8 +24573,7 @@ define void @s_shuffle_v2i64_v8i64__u_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24161,8 +24628,7 @@ define void @s_shuffle_v2i64_v8i64__0_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24221,10 +24687,8 @@ define void @s_shuffle_v2i64_v8i64__1_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24279,8 +24743,7 @@ define void @s_shuffle_v2i64_v8i64__2_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24339,10 +24802,8 @@ define void @s_shuffle_v2i64_v8i64__3_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24423,8 +24884,7 @@ define void @s_shuffle_v2i64_v8i64__4_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24501,13 +24961,11 @@ define void @s_shuffle_v2i64_v8i64__5_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24616,8 +25074,7 @@ define void @s_shuffle_v2i64_v8i64__6_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s20
-; GFX942-NEXT:    s_mov_b32 s15, s21
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[20:21]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -24734,10 +25191,8 @@ define void @s_shuffle_v2i64_v8i64__7_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24756,18 +25211,43 @@ define void @s_shuffle_v2i64_v8i64__7_10() {
 }
 
 define void @s_shuffle_v2i64_v8i64__8_10() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_10:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_10:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_10:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_10:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 8, i32 10>
@@ -24812,10 +25292,8 @@ define void @s_shuffle_v2i64_v8i64__9_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24828,18 +25306,43 @@ define void @s_shuffle_v2i64_v8i64__9_10() {
 }
 
 define void @s_shuffle_v2i64_v8i64__10_10() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_10:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_10:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_10:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_10:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 10>
@@ -24884,10 +25387,8 @@ define void @s_shuffle_v2i64_v8i64__11_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24936,8 +25437,7 @@ define void @s_shuffle_v2i64_v8i64__12_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24986,10 +25486,8 @@ define void @s_shuffle_v2i64_v8i64__13_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25038,8 +25536,7 @@ define void @s_shuffle_v2i64_v8i64__14_10() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -25137,8 +25634,7 @@ define void @s_shuffle_v2i64_v8i64__0_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25239,8 +25735,7 @@ define void @s_shuffle_v2i64_v8i64__1_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25295,8 +25790,7 @@ define void @s_shuffle_v2i64_v8i64__2_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25378,8 +25872,7 @@ define void @s_shuffle_v2i64_v8i64__3_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25466,8 +25959,7 @@ define void @s_shuffle_v2i64_v8i64__4_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25522,8 +26014,7 @@ define void @s_shuffle_v2i64_v8i64__5_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25632,8 +26123,7 @@ define void @s_shuffle_v2i64_v8i64__6_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s22
-; GFX942-NEXT:    s_mov_b32 s15, s23
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[22:23]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -25696,8 +26186,7 @@ define void @s_shuffle_v2i64_v8i64__7_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25710,18 +26199,43 @@ define void @s_shuffle_v2i64_v8i64__7_11() {
 }
 
 define void @s_shuffle_v2i64_v8i64__8_11() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_11:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_11:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_11:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_11:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 8, i32 11>
@@ -25730,18 +26244,43 @@ define void @s_shuffle_v2i64_v8i64__8_11() {
 }
 
 define void @s_shuffle_v2i64_v8i64__9_11() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__9_11:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_11:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_11:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_11:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 9, i32 11>
@@ -25791,18 +26330,43 @@ define void @s_shuffle_v2i64_v8i64__10_11() {
 }
 
 define void @s_shuffle_v2i64_v8i64__11_11() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__11_11:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_11:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_11:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_11:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 11, i32 11>
@@ -25847,8 +26411,7 @@ define void @s_shuffle_v2i64_v8i64__12_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25861,18 +26424,43 @@ define void @s_shuffle_v2i64_v8i64__12_11() {
 }
 
 define void @s_shuffle_v2i64_v8i64__13_11() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__13_11:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_11:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_11:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_11:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 13, i32 11>
@@ -25917,8 +26505,7 @@ define void @s_shuffle_v2i64_v8i64__14_11() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -25965,8 +26552,7 @@ define void @s_shuffle_v2i64_v8i64__u_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s8
-; GFX942-NEXT:    s_mov_b32 s11, s9
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26021,8 +26607,7 @@ define void @s_shuffle_v2i64_v8i64__0_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26081,10 +26666,8 @@ define void @s_shuffle_v2i64_v8i64__1_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26139,8 +26722,7 @@ define void @s_shuffle_v2i64_v8i64__2_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26199,10 +26781,8 @@ define void @s_shuffle_v2i64_v8i64__3_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26283,8 +26863,7 @@ define void @s_shuffle_v2i64_v8i64__4_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26361,13 +26940,11 @@ define void @s_shuffle_v2i64_v8i64__5_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26476,8 +27053,7 @@ define void @s_shuffle_v2i64_v8i64__6_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s24
-; GFX942-NEXT:    s_mov_b32 s15, s25
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[24:25]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -26539,159 +27115,263 @@ define void @s_shuffle_v2i64_v8i64__7_12() {
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_12:
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_12:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX90A-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX90A-NEXT:    v_writelane_b32 v0, s36, 0
+; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
+; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
+; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
+; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
+; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
+; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
+; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[36:51]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s44
+; GFX90A-NEXT:    s_mov_b32 s11, s45
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
+; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
+; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
+; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
+; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
+; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
+; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
+; GFX90A-NEXT:    v_readlane_b32 s36, v0, 0
+; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_12:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
+; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[24:25]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
+; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
+; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %vec1 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 7, i32 12>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__8_12() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_12:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_12:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_12:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %vec1 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 8, i32 12>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__9_12() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_12:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_12:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_12:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %vec1 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 9, i32 12>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__10_12() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_12:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_12:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_12:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %vec1 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 12>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__11_12() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_12:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_12:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX90A-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX90A-NEXT:    v_writelane_b32 v0, s36, 0
-; GFX90A-NEXT:    v_writelane_b32 v0, s37, 1
-; GFX90A-NEXT:    v_writelane_b32 v0, s38, 2
-; GFX90A-NEXT:    v_writelane_b32 v0, s39, 3
-; GFX90A-NEXT:    v_writelane_b32 v0, s48, 4
-; GFX90A-NEXT:    v_writelane_b32 v0, s49, 5
-; GFX90A-NEXT:    v_writelane_b32 v0, s50, 6
-; GFX90A-NEXT:    v_writelane_b32 v0, s51, 7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[36:51]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s44
-; GFX90A-NEXT:    s_mov_b32 s11, s45
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_readlane_b32 s51, v0, 7
-; GFX90A-NEXT:    v_readlane_b32 s50, v0, 6
-; GFX90A-NEXT:    v_readlane_b32 s49, v0, 5
-; GFX90A-NEXT:    v_readlane_b32 s48, v0, 4
-; GFX90A-NEXT:    v_readlane_b32 s39, v0, 3
-; GFX90A-NEXT:    v_readlane_b32 s38, v0, 2
-; GFX90A-NEXT:    v_readlane_b32 s37, v0, 1
-; GFX90A-NEXT:    v_readlane_b32 s36, v0, 0
-; GFX90A-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX90A-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_12:
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__11_12:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
-; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s24
-; GFX942-NEXT:    s_mov_b32 s11, s25
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %vec1 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 7, i32 12>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__8_12() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_12:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s16
-; GFX9-NEXT:    s_mov_b32 s11, s17
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %vec1 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 8, i32 12>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__9_12() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__9_12:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %vec1 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 9, i32 12>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__10_12() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_12:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %vec1 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 12>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__11_12() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__11_12:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 11, i32 12>
@@ -26736,8 +27416,7 @@ define void @s_shuffle_v2i64_v8i64__12_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s8
-; GFX942-NEXT:    s_mov_b32 s11, s9
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26750,20 +27429,48 @@ define void @s_shuffle_v2i64_v8i64__12_12() {
 }
 
 define void @s_shuffle_v2i64_v8i64__13_12() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__13_12:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_12:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_12:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_12:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 13, i32 12>
@@ -26808,8 +27515,7 @@ define void @s_shuffle_v2i64_v8i64__14_12() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s8
-; GFX942-NEXT:    s_mov_b32 s15, s9
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -26911,8 +27617,7 @@ define void @s_shuffle_v2i64_v8i64__0_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -26975,8 +27680,7 @@ define void @s_shuffle_v2i64_v8i64__1_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27037,8 +27741,7 @@ define void @s_shuffle_v2i64_v8i64__2_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27097,8 +27800,7 @@ define void @s_shuffle_v2i64_v8i64__3_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27179,8 +27881,7 @@ define void @s_shuffle_v2i64_v8i64__4_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27261,8 +27962,7 @@ define void @s_shuffle_v2i64_v8i64__5_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27371,8 +28071,7 @@ define void @s_shuffle_v2i64_v8i64__6_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s26
-; GFX942-NEXT:    s_mov_b32 s15, s27
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[26:27]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -27485,8 +28184,7 @@ define void @s_shuffle_v2i64_v8i64__7_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27499,18 +28197,43 @@ define void @s_shuffle_v2i64_v8i64__7_13() {
 }
 
 define void @s_shuffle_v2i64_v8i64__8_13() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_13:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s18
-; GFX9-NEXT:    s_mov_b32 s11, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_13:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_13:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_13:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 8, i32 13>
@@ -27525,62 +28248,86 @@ define void @s_shuffle_v2i64_v8i64__9_13() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
-; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX900-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_13:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_13:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <8 x i64> asm "; def $0", "=s"()
+  %vec1 = call <8 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 9, i32 13>
+  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v2i64_v8i64__10_13() {
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_13:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_13:
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_13:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b64 s[8:9], s[12:13]
-; GFX90A-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v2i64_v8i64__9_13:
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_13:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <8 x i64> asm "; def $0", "=s"()
-  %vec1 = call <8 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 9, i32 13>
-  call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v2i64_v8i64__10_13() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_13:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 13>
@@ -27625,8 +28372,7 @@ define void @s_shuffle_v2i64_v8i64__11_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27720,8 +28466,7 @@ define void @s_shuffle_v2i64_v8i64__13_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27770,8 +28515,7 @@ define void @s_shuffle_v2i64_v8i64__14_13() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -27818,8 +28562,7 @@ define void @s_shuffle_v2i64_v8i64__u_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27874,8 +28617,7 @@ define void @s_shuffle_v2i64_v8i64__0_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s24
-; GFX942-NEXT:    s_mov_b32 s11, s25
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[24:25]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27934,10 +28676,8 @@ define void @s_shuffle_v2i64_v8i64__1_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -27992,8 +28732,7 @@ define void @s_shuffle_v2i64_v8i64__2_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s24
-; GFX942-NEXT:    s_mov_b32 s11, s25
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[24:25]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28052,10 +28791,8 @@ define void @s_shuffle_v2i64_v8i64__3_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28136,8 +28873,7 @@ define void @s_shuffle_v2i64_v8i64__4_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s24
-; GFX942-NEXT:    s_mov_b32 s11, s25
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[24:25]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28214,13 +28950,11 @@ define void @s_shuffle_v2i64_v8i64__5_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s24
-; GFX942-NEXT:    s_mov_b32 s11, s25
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[24:25]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28329,8 +29063,7 @@ define void @s_shuffle_v2i64_v8i64__6_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s28
-; GFX942-NEXT:    s_mov_b32 s15, s29
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[28:29]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -28447,10 +29180,8 @@ define void @s_shuffle_v2i64_v8i64__7_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s28
-; GFX942-NEXT:    s_mov_b32 s11, s29
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[28:29]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28469,18 +29200,43 @@ define void @s_shuffle_v2i64_v8i64__7_14() {
 }
 
 define void @s_shuffle_v2i64_v8i64__8_14() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_14:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s20
-; GFX9-NEXT:    s_mov_b32 s11, s21
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_14:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_14:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_14:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 8, i32 14>
@@ -28525,10 +29281,8 @@ define void @s_shuffle_v2i64_v8i64__9_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28541,18 +29295,43 @@ define void @s_shuffle_v2i64_v8i64__9_14() {
 }
 
 define void @s_shuffle_v2i64_v8i64__10_14() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_14:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s16
-; GFX9-NEXT:    s_mov_b32 s11, s17
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_14:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_14:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_14:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 14>
@@ -28597,10 +29376,8 @@ define void @s_shuffle_v2i64_v8i64__11_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28649,8 +29426,7 @@ define void @s_shuffle_v2i64_v8i64__12_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28699,10 +29475,8 @@ define void @s_shuffle_v2i64_v8i64__13_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28751,8 +29525,7 @@ define void @s_shuffle_v2i64_v8i64__14_14() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -28855,8 +29628,7 @@ define void @s_shuffle_v2i64_v8i64__0_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s26
-; GFX942-NEXT:    s_mov_b32 s11, s27
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -28908,20 +29680,29 @@ define void @s_shuffle_v2i64_v8i64__1_15() {
 ; GFX942-LABEL: s_shuffle_v2i64_v8i64__1_15:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
+; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s16, s2
-; GFX942-NEXT:    s_mov_b32 s17, s3
-; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
-; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
+; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
+; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
+; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
@@ -28973,8 +29754,7 @@ define void @s_shuffle_v2i64_v8i64__2_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s26
-; GFX942-NEXT:    s_mov_b32 s11, s27
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -29027,16 +29807,15 @@ define void @s_shuffle_v2i64_v8i64__3_15() {
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s20, s6
-; GFX942-NEXT:    s_mov_b32 s21, s7
-; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
-; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -29117,8 +29896,7 @@ define void @s_shuffle_v2i64_v8i64__4_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:27]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s26
-; GFX942-NEXT:    s_mov_b32 s11, s27
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[26:27]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -29193,16 +29971,15 @@ define void @s_shuffle_v2i64_v8i64__5_15() {
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:15]
+; GFX942-NEXT:    ; def s[8:23]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:27]
+; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s24, s10
-; GFX942-NEXT:    s_mov_b32 s25, s11
-; GFX942-NEXT:    s_mov_b64 s[8:9], s[24:25]
-; GFX942-NEXT:    s_mov_b64 s[10:11], s[26:27]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -29311,8 +30088,7 @@ define void @s_shuffle_v2i64_v8i64__6_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:31]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s30
-; GFX942-NEXT:    s_mov_b32 s15, s31
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[30:31]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -29418,30 +30194,19 @@ define void @s_shuffle_v2i64_v8i64__7_15() {
 ; GFX942-LABEL: s_shuffle_v2i64_v8i64__7_15:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX942-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:31]
+; GFX942-NEXT:    ; def s[4:19]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s28, s14
-; GFX942-NEXT:    s_mov_b32 s29, s15
-; GFX942-NEXT:    s_mov_b64 s[8:9], s[28:29]
-; GFX942-NEXT:    s_mov_b64 s[10:11], s[30:31]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    v_readlane_b32 s31, v0, 1
-; GFX942-NEXT:    v_readlane_b32 s30, v0, 0
-; GFX942-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
-; GFX942-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
@@ -29451,18 +30216,43 @@ define void @s_shuffle_v2i64_v8i64__7_15() {
 }
 
 define void @s_shuffle_v2i64_v8i64__8_15() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_15:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:23]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s22
-; GFX9-NEXT:    s_mov_b32 s11, s23
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_15:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_15:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_15:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 8, i32 15>
@@ -29507,8 +30297,7 @@ define void @s_shuffle_v2i64_v8i64__9_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -29523,18 +30312,43 @@ define void @s_shuffle_v2i64_v8i64__9_15() {
 }
 
 define void @s_shuffle_v2i64_v8i64__10_15() {
-; GFX9-LABEL: s_shuffle_v2i64_v8i64__10_15:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:19]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s18
-; GFX9-NEXT:    s_mov_b32 s11, s19
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_15:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_15:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2i64_v8i64__10_15:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <8 x i64> asm "; def $0", "=s"()
   %vec1 = call <8 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 10, i32 15>
@@ -29579,8 +30393,7 @@ define void @s_shuffle_v2i64_v8i64__11_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -29631,8 +30444,7 @@ define void @s_shuffle_v2i64_v8i64__12_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -29681,8 +30493,7 @@ define void @s_shuffle_v2i64_v8i64__13_15() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
 ; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll
index 7f8f2dbbb09a1..54e700625d72c 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll
@@ -88,8 +88,7 @@ define void @v_shuffle_v2p0_v2p0__1_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -144,8 +143,7 @@ define void @v_shuffle_v2p0_v2p0__3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -204,10 +202,8 @@ define void @v_shuffle_v2p0_v2p0__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -262,8 +258,7 @@ define void @v_shuffle_v2p0_v2p0__3_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -309,8 +304,7 @@ define void @v_shuffle_v2p0_v2p0__3_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -355,8 +349,7 @@ define void @v_shuffle_v2p0_v2p0__3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -401,8 +394,7 @@ define void @v_shuffle_v2p0_v2p0__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -446,8 +438,7 @@ define void @v_shuffle_v2p0_v2p0__0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -492,8 +483,7 @@ define void @v_shuffle_v2p0_v2p0__1_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -537,8 +527,7 @@ define void @v_shuffle_v2p0_v2p0__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -660,8 +649,7 @@ define void @v_shuffle_v2p0_v2p0__1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -794,8 +782,7 @@ define void @v_shuffle_v2p0_v2p0__1_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -900,8 +887,7 @@ define void @v_shuffle_v2p0_v2p0__0_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -956,8 +942,7 @@ define void @v_shuffle_v2p0_v2p0__1_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1095,8 +1080,7 @@ define void @s_shuffle_v2p0_v2p0__1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1154,8 +1138,7 @@ define void @s_shuffle_v2p0_v2p0__3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1213,10 +1196,8 @@ define void @s_shuffle_v2p0_v2p0__3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1270,8 +1251,7 @@ define void @s_shuffle_v2p0_v2p0__3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1320,10 +1300,8 @@ define void @s_shuffle_v2p0_v2p0__3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1336,18 +1314,43 @@ define void @s_shuffle_v2p0_v2p0__3_2() {
 }
 
 define void @s_shuffle_v2p0_v2p0__3_3() {
-; GFX9-LABEL: s_shuffle_v2p0_v2p0__3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v2p0__3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v2p0__3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v2p0__3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> <i32 3, i32 3>
@@ -1388,8 +1391,7 @@ define void @s_shuffle_v2p0_v2p0__u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1401,18 +1403,43 @@ define void @s_shuffle_v2p0_v2p0__u_0() {
 }
 
 define void @s_shuffle_v2p0_v2p0__0_0() {
-; GFX9-LABEL: s_shuffle_v2p0_v2p0__0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v2p0__0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v2p0__0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v2p0__0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -1456,10 +1483,8 @@ define void @s_shuffle_v2p0_v2p0__1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1503,8 +1528,7 @@ define void @s_shuffle_v2p0_v2p0__2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1596,18 +1620,43 @@ define void @s_shuffle_v2p0_v2p0__0_1() {
 }
 
 define void @s_shuffle_v2p0_v2p0__1_1() {
-; GFX9-LABEL: s_shuffle_v2p0_v2p0__1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v2p0__1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v2p0__1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v2p0__1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> <i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -1741,8 +1790,7 @@ define void @s_shuffle_v2p0_v2p0__1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1850,8 +1898,7 @@ define void @s_shuffle_v2p0_v2p0__0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -1905,8 +1952,7 @@ define void @s_shuffle_v2p0_v2p0__1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll
index 27a6cf11c4cb1..9c770bf1c77cc 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll
@@ -127,8 +127,7 @@ define void @v_shuffle_v2p0_v3p0__2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -223,8 +222,7 @@ define void @v_shuffle_v2p0_v3p0__5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -283,10 +281,8 @@ define void @v_shuffle_v2p0_v3p0__5_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -341,8 +337,7 @@ define void @v_shuffle_v2p0_v3p0__5_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v8
-; GFX942-NEXT:    v_mov_b32_e32 v1, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -397,8 +392,7 @@ define void @v_shuffle_v2p0_v3p0__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v10
-; GFX942-NEXT:    v_mov_b32_e32 v3, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -447,10 +441,8 @@ define void @v_shuffle_v2p0_v3p0__5_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -495,8 +487,7 @@ define void @v_shuffle_v2p0_v3p0__5_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -541,8 +532,7 @@ define void @v_shuffle_v2p0_v3p0__5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -587,8 +577,7 @@ define void @v_shuffle_v2p0_v3p0__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -632,8 +621,7 @@ define void @v_shuffle_v2p0_v3p0__0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -677,8 +665,7 @@ define void @v_shuffle_v2p0_v3p0__1_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -726,10 +713,8 @@ define void @v_shuffle_v2p0_v3p0__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -773,8 +758,7 @@ define void @v_shuffle_v2p0_v3p0__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -828,8 +812,7 @@ define void @v_shuffle_v2p0_v3p0__4_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -952,8 +935,7 @@ define void @v_shuffle_v2p0_v3p0__1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -997,8 +979,7 @@ define void @v_shuffle_v2p0_v3p0__2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1091,8 +1072,7 @@ define void @v_shuffle_v2p0_v3p0__4_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v2
-; GFX942-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1176,8 +1156,7 @@ define void @v_shuffle_v2p0_v3p0__0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1260,8 +1239,7 @@ define void @v_shuffle_v2p0_v3p0__2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1354,8 +1332,7 @@ define void @v_shuffle_v2p0_v3p0__4_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v4
-; GFX942-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1489,8 +1466,7 @@ define void @v_shuffle_v2p0_v3p0__2_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v4
-; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1545,8 +1521,7 @@ define void @v_shuffle_v2p0_v3p0__4_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1641,8 +1616,7 @@ define void @v_shuffle_v2p0_v3p0__0_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1697,8 +1671,7 @@ define void @v_shuffle_v2p0_v3p0__1_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1753,8 +1726,7 @@ define void @v_shuffle_v2p0_v3p0__2_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1839,8 +1811,7 @@ define void @v_shuffle_v2p0_v3p0__4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1935,8 +1906,7 @@ define void @v_shuffle_v2p0_v3p0__0_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1991,8 +1961,7 @@ define void @v_shuffle_v2p0_v3p0__1_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v8
-; GFX942-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2047,8 +2016,7 @@ define void @v_shuffle_v2p0_v3p0__2_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v4
-; GFX942-NEXT:    v_mov_b32_e32 v9, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2093,8 +2061,7 @@ define void @v_shuffle_v2p0_v3p0__3_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2232,8 +2199,7 @@ define void @s_shuffle_v2p0_v3p0__1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2273,8 +2239,7 @@ define void @s_shuffle_v2p0_v3p0__2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2332,8 +2297,7 @@ define void @s_shuffle_v2p0_v3p0__4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2374,8 +2338,7 @@ define void @s_shuffle_v2p0_v3p0__5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2430,11 +2393,11 @@ define void @s_shuffle_v2p0_v3p0__5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2484,8 +2447,7 @@ define void @s_shuffle_v2p0_v3p0__5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2539,10 +2501,8 @@ define void @s_shuffle_v2p0_v3p0__5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2587,10 +2547,8 @@ define void @s_shuffle_v2p0_v3p0__5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2603,18 +2561,43 @@ define void @s_shuffle_v2p0_v3p0__5_3() {
 }
 
 define void @s_shuffle_v2p0_v3p0__5_4() {
-; GFX9-LABEL: s_shuffle_v2p0_v3p0__5_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v3p0__5_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v3p0__5_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v3p0__5_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> <i32 5, i32 4>
@@ -2659,10 +2642,8 @@ define void @s_shuffle_v2p0_v3p0__5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2707,8 +2688,7 @@ define void @s_shuffle_v2p0_v3p0__u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2720,18 +2700,43 @@ define void @s_shuffle_v2p0_v3p0__u_0() {
 }
 
 define void @s_shuffle_v2p0_v3p0__0_0() {
-; GFX9-LABEL: s_shuffle_v2p0_v3p0__0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v3p0__0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v3p0__0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v3p0__0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -2775,10 +2780,8 @@ define void @s_shuffle_v2p0_v3p0__1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2822,10 +2825,8 @@ define void @s_shuffle_v2p0_v3p0__2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2869,8 +2870,7 @@ define void @s_shuffle_v2p0_v3p0__3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2924,13 +2924,11 @@ define void @s_shuffle_v2p0_v3p0__4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3023,18 +3021,43 @@ define void @s_shuffle_v2p0_v3p0__0_1() {
 }
 
 define void @s_shuffle_v2p0_v3p0__1_1() {
-; GFX9-LABEL: s_shuffle_v2p0_v3p0__1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v3p0__1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v3p0__1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v3p0__1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> <i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -3042,18 +3065,43 @@ define void @s_shuffle_v2p0_v3p0__1_1() {
 }
 
 define void @s_shuffle_v2p0_v3p0__2_1() {
-; GFX9-LABEL: s_shuffle_v2p0_v3p0__2_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v3p0__2_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v3p0__2_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v3p0__2_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> <i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -3142,8 +3190,7 @@ define void @s_shuffle_v2p0_v3p0__4_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3188,8 +3235,7 @@ define void @s_shuffle_v2p0_v3p0__u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3201,18 +3247,43 @@ define void @s_shuffle_v2p0_v3p0__u_2() {
 }
 
 define void @s_shuffle_v2p0_v3p0__0_2() {
-; GFX9-LABEL: s_shuffle_v2p0_v3p0__0_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v3p0__0_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v3p0__0_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v3p0__0_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> <i32 0, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -3256,10 +3327,8 @@ define void @s_shuffle_v2p0_v3p0__1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3307,10 +3376,8 @@ define void @s_shuffle_v2p0_v3p0__2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3354,8 +3421,7 @@ define void @s_shuffle_v2p0_v3p0__3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3412,10 +3478,8 @@ define void @s_shuffle_v2p0_v3p0__4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3514,8 +3578,7 @@ define void @s_shuffle_v2p0_v3p0__1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3555,8 +3618,7 @@ define void @s_shuffle_v2p0_v3p0__2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3618,10 +3680,8 @@ define void @s_shuffle_v2p0_v3p0__4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3716,8 +3776,7 @@ define void @s_shuffle_v2p0_v3p0__0_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3771,8 +3830,7 @@ define void @s_shuffle_v2p0_v3p0__1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3826,8 +3884,7 @@ define void @s_shuffle_v2p0_v3p0__2_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3881,18 +3938,43 @@ define void @s_shuffle_v2p0_v3p0__3_4() {
 }
 
 define void @s_shuffle_v2p0_v3p0__4_4() {
-; GFX9-LABEL: s_shuffle_v2p0_v3p0__4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v3p0__4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v3p0__4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v3p0__4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> <i32 4, i32 4>
@@ -3933,8 +4015,7 @@ define void @s_shuffle_v2p0_v3p0__u_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3988,8 +4069,7 @@ define void @s_shuffle_v2p0_v3p0__0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4047,10 +4127,8 @@ define void @s_shuffle_v2p0_v3p0__1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4104,10 +4182,8 @@ define void @s_shuffle_v2p0_v3p0__2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4120,18 +4196,43 @@ define void @s_shuffle_v2p0_v3p0__2_5() {
 }
 
 define void @s_shuffle_v2p0_v3p0__3_5() {
-; GFX9-LABEL: s_shuffle_v2p0_v3p0__3_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v3p0__3_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v3p0__3_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v3p0__3_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> <i32 3, i32 5>
@@ -4176,10 +4277,8 @@ define void @s_shuffle_v2p0_v3p0__4_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll
index ae31524ebaa7f..47634638d7674 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll
@@ -166,8 +166,7 @@ define void @v_shuffle_v2p0_v4p0__3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -302,8 +301,7 @@ define void @v_shuffle_v2p0_v4p0__7_u(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -362,10 +360,8 @@ define void @v_shuffle_v2p0_v4p0__7_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -420,8 +416,7 @@ define void @v_shuffle_v2p0_v4p0__7_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v10
-; GFX942-NEXT:    v_mov_b32_e32 v1, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -476,8 +471,7 @@ define void @v_shuffle_v2p0_v4p0__7_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v12
-; GFX942-NEXT:    v_mov_b32_e32 v3, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -532,8 +526,7 @@ define void @v_shuffle_v2p0_v4p0__7_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v14
-; GFX942-NEXT:    v_mov_b32_e32 v5, v15
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[14:15]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -582,10 +575,8 @@ define void @v_shuffle_v2p0_v4p0__7_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -630,8 +621,7 @@ define void @v_shuffle_v2p0_v4p0__7_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -676,8 +666,7 @@ define void @v_shuffle_v2p0_v4p0__7_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -722,8 +711,7 @@ define void @v_shuffle_v2p0_v4p0__7_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -768,8 +756,7 @@ define void @v_shuffle_v2p0_v4p0__u_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -813,8 +800,7 @@ define void @v_shuffle_v2p0_v4p0__0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -858,8 +844,7 @@ define void @v_shuffle_v2p0_v4p0__1_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -903,8 +888,7 @@ define void @v_shuffle_v2p0_v4p0__2_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -952,10 +936,8 @@ define void @v_shuffle_v2p0_v4p0__3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -999,8 +981,7 @@ define void @v_shuffle_v2p0_v4p0__4_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1054,8 +1035,7 @@ define void @v_shuffle_v2p0_v4p0__5_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1110,8 +1090,7 @@ define void @v_shuffle_v2p0_v4p0__6_0(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v0
-; GFX942-NEXT:    v_mov_b32_e32 v9, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1234,8 +1213,7 @@ define void @v_shuffle_v2p0_v4p0__1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1279,8 +1257,7 @@ define void @v_shuffle_v2p0_v4p0__2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v2
-; GFX942-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1324,8 +1301,7 @@ define void @v_shuffle_v2p0_v4p0__3_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1418,8 +1394,7 @@ define void @v_shuffle_v2p0_v4p0__5_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v2
-; GFX942-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[6:9], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1474,8 +1449,7 @@ define void @v_shuffle_v2p0_v4p0__6_1(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v2
-; GFX942-NEXT:    v_mov_b32_e32 v11, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1559,8 +1533,7 @@ define void @v_shuffle_v2p0_v4p0__0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1643,8 +1616,7 @@ define void @v_shuffle_v2p0_v4p0__2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1688,8 +1660,7 @@ define void @v_shuffle_v2p0_v4p0__3_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1782,8 +1753,7 @@ define void @v_shuffle_v2p0_v4p0__5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v4
-; GFX942-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1838,8 +1808,7 @@ define void @v_shuffle_v2p0_v4p0__6_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v4
-; GFX942-NEXT:    v_mov_b32_e32 v13, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1923,8 +1892,7 @@ define void @v_shuffle_v2p0_v4p0__0_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -1968,8 +1936,7 @@ define void @v_shuffle_v2p0_v4p0__1_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2052,8 +2019,7 @@ define void @v_shuffle_v2p0_v4p0__3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2146,8 +2112,7 @@ define void @v_shuffle_v2p0_v4p0__5_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v6
-; GFX942-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2202,8 +2167,7 @@ define void @v_shuffle_v2p0_v4p0__6_3(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v14, v6
-; GFX942-NEXT:    v_mov_b32_e32 v15, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2376,8 +2340,7 @@ define void @v_shuffle_v2p0_v4p0__3_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v0, v6
-; GFX942-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2432,8 +2395,7 @@ define void @v_shuffle_v2p0_v4p0__5_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2478,8 +2440,7 @@ define void @v_shuffle_v2p0_v4p0__6_4(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2574,8 +2535,7 @@ define void @v_shuffle_v2p0_v4p0__0_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2630,8 +2590,7 @@ define void @v_shuffle_v2p0_v4p0__1_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2686,8 +2645,7 @@ define void @v_shuffle_v2p0_v4p0__2_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v8
-; GFX942-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2742,8 +2700,7 @@ define void @v_shuffle_v2p0_v4p0__3_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v8, v6
-; GFX942-NEXT:    v_mov_b32_e32 v9, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2828,8 +2785,7 @@ define void @v_shuffle_v2p0_v4p0__5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v2
-; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2874,8 +2830,7 @@ define void @v_shuffle_v2p0_v4p0__6_5(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v2
-; GFX942-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[2:3]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -2970,8 +2925,7 @@ define void @v_shuffle_v2p0_v4p0__0_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3026,8 +2980,7 @@ define void @v_shuffle_v2p0_v4p0__1_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v8
-; GFX942-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3082,8 +3035,7 @@ define void @v_shuffle_v2p0_v4p0__2_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v10
-; GFX942-NEXT:    v_mov_b32_e32 v7, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3138,8 +3090,7 @@ define void @v_shuffle_v2p0_v4p0__3_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v10, v6
-; GFX942-NEXT:    v_mov_b32_e32 v11, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[10:13], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3184,8 +3135,7 @@ define void @v_shuffle_v2p0_v4p0__4_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v4
-; GFX942-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3270,8 +3220,7 @@ define void @v_shuffle_v2p0_v4p0__6_6(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v4
-; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3366,8 +3315,7 @@ define void @v_shuffle_v2p0_v4p0__0_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[2:9]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v8
-; GFX942-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
 ; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3422,8 +3370,7 @@ define void @v_shuffle_v2p0_v4p0__1_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v10
-; GFX942-NEXT:    v_mov_b32_e32 v5, v11
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[10:11]
 ; GFX942-NEXT:    global_store_dwordx4 v12, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3478,8 +3425,7 @@ define void @v_shuffle_v2p0_v4p0__2_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[6:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v6, v12
-; GFX942-NEXT:    v_mov_b32_e32 v7, v13
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[12:13]
 ; GFX942-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3534,8 +3480,7 @@ define void @v_shuffle_v2p0_v4p0__3_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b32_e32 v12, v6
-; GFX942-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3580,8 +3525,7 @@ define void @v_shuffle_v2p0_v4p0__4_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v6
-; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3626,8 +3570,7 @@ define void @v_shuffle_v2p0_v4p0__5_7(ptr addrspace(1) inreg %ptr) {
 ; GFX942-NEXT:    ; def v[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 0
-; GFX942-NEXT:    v_mov_b32_e32 v4, v6
-; GFX942-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -3765,8 +3708,7 @@ define void @s_shuffle_v2p0_v4p0__1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3850,8 +3792,7 @@ define void @s_shuffle_v2p0_v4p0__3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3909,8 +3850,7 @@ define void @s_shuffle_v2p0_v4p0__5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3996,8 +3936,7 @@ define void @s_shuffle_v2p0_v4p0__7_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4056,10 +3995,8 @@ define void @s_shuffle_v2p0_v4p0__7_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4113,8 +4050,7 @@ define void @s_shuffle_v2p0_v4p0__7_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4172,10 +4108,8 @@ define void @s_shuffle_v2p0_v4p0__7_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4230,8 +4164,7 @@ define void @s_shuffle_v2p0_v4p0__7_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4280,10 +4213,8 @@ define void @s_shuffle_v2p0_v4p0__7_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4296,18 +4227,43 @@ define void @s_shuffle_v2p0_v4p0__7_4() {
 }
 
 define void @s_shuffle_v2p0_v4p0__7_5() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__7_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__7_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__7_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__7_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 7, i32 5>
@@ -4352,10 +4308,8 @@ define void @s_shuffle_v2p0_v4p0__7_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4368,18 +4322,43 @@ define void @s_shuffle_v2p0_v4p0__7_6() {
 }
 
 define void @s_shuffle_v2p0_v4p0__7_7() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 7, i32 7>
@@ -4420,8 +4399,7 @@ define void @s_shuffle_v2p0_v4p0__u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4433,18 +4411,43 @@ define void @s_shuffle_v2p0_v4p0__u_0() {
 }
 
 define void @s_shuffle_v2p0_v4p0__0_0() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -4488,10 +4491,8 @@ define void @s_shuffle_v2p0_v4p0__1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4503,18 +4504,43 @@ define void @s_shuffle_v2p0_v4p0__1_0() {
 }
 
 define void @s_shuffle_v2p0_v4p0__2_0() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__2_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s4
-; GFX9-NEXT:    s_mov_b32 s11, s5
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__2_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__2_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__2_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 2, i32 0>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -4558,10 +4584,8 @@ define void @s_shuffle_v2p0_v4p0__3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4605,8 +4629,7 @@ define void @s_shuffle_v2p0_v4p0__4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4664,10 +4687,8 @@ define void @s_shuffle_v2p0_v4p0__5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4722,8 +4743,7 @@ define void @s_shuffle_v2p0_v4p0__6_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4816,18 +4836,43 @@ define void @s_shuffle_v2p0_v4p0__0_1() {
 }
 
 define void @s_shuffle_v2p0_v4p0__1_1() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -4835,18 +4880,43 @@ define void @s_shuffle_v2p0_v4p0__1_1() {
 }
 
 define void @s_shuffle_v2p0_v4p0__2_1() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__2_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__2_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__2_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__2_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -4854,18 +4924,43 @@ define void @s_shuffle_v2p0_v4p0__2_1() {
 }
 
 define void @s_shuffle_v2p0_v4p0__3_1() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__3_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__3_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__3_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__3_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 3, i32 1>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -4954,8 +5049,7 @@ define void @s_shuffle_v2p0_v4p0__5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5010,8 +5104,7 @@ define void @s_shuffle_v2p0_v4p0__6_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5056,8 +5149,7 @@ define void @s_shuffle_v2p0_v4p0__u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5069,18 +5161,43 @@ define void @s_shuffle_v2p0_v4p0__u_2() {
 }
 
 define void @s_shuffle_v2p0_v4p0__0_2() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__0_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__0_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__0_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__0_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 0, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -5124,10 +5241,8 @@ define void @s_shuffle_v2p0_v4p0__1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5139,18 +5254,43 @@ define void @s_shuffle_v2p0_v4p0__1_2() {
 }
 
 define void @s_shuffle_v2p0_v4p0__2_2() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -5194,10 +5334,8 @@ define void @s_shuffle_v2p0_v4p0__3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5241,8 +5379,7 @@ define void @s_shuffle_v2p0_v4p0__4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5299,10 +5436,8 @@ define void @s_shuffle_v2p0_v4p0__5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5357,8 +5492,7 @@ define void @s_shuffle_v2p0_v4p0__6_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5411,18 +5545,43 @@ define void @s_shuffle_v2p0_v4p0__u_3() {
 }
 
 define void @s_shuffle_v2p0_v4p0__0_3() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__0_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__0_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__0_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__0_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 0, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -5430,18 +5589,43 @@ define void @s_shuffle_v2p0_v4p0__0_3() {
 }
 
 define void @s_shuffle_v2p0_v4p0__1_3() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__1_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__1_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__1_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__1_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 1, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -5489,18 +5673,43 @@ define void @s_shuffle_v2p0_v4p0__2_3() {
 }
 
 define void @s_shuffle_v2p0_v4p0__3_3() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> <i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x ptr> %shuf)
@@ -5590,8 +5799,7 @@ define void @s_shuffle_v2p0_v4p0__5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5646,8 +5854,7 @@ define void @s_shuffle_v2p0_v4p0__6_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5746,8 +5953,7 @@ define void @s_shuffle_v2p0_v4p0__1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5831,8 +6037,7 @@ define void @s_shuffle_v2p0_v4p0__3_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5894,10 +6099,8 @@ define void @s_shuffle_v2p0_v4p0__5_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5910,18 +6113,43 @@ define void @s_shuffle_v2p0_v4p0__5_4() {
 }
 
 define void @s_shuffle_v2p0_v4p0__6_4() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__6_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s4
-; GFX9-NEXT:    s_mov_b32 s11, s5
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__6_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__6_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__6_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 6, i32 4>
@@ -6012,8 +6240,7 @@ define void @s_shuffle_v2p0_v4p0__0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6067,8 +6294,7 @@ define void @s_shuffle_v2p0_v4p0__1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6123,8 +6349,7 @@ define void @s_shuffle_v2p0_v4p0__2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6178,8 +6403,7 @@ define void @s_shuffle_v2p0_v4p0__3_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6233,18 +6457,43 @@ define void @s_shuffle_v2p0_v4p0__4_5() {
 }
 
 define void @s_shuffle_v2p0_v4p0__5_5() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 5, i32 5>
@@ -6253,18 +6502,43 @@ define void @s_shuffle_v2p0_v4p0__5_5() {
 }
 
 define void @s_shuffle_v2p0_v4p0__6_5() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__6_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__6_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__6_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__6_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 6, i32 5>
@@ -6305,8 +6579,7 @@ define void @s_shuffle_v2p0_v4p0__u_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6360,8 +6633,7 @@ define void @s_shuffle_v2p0_v4p0__0_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6419,10 +6691,8 @@ define void @s_shuffle_v2p0_v4p0__1_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6477,8 +6747,7 @@ define void @s_shuffle_v2p0_v4p0__2_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6536,10 +6805,8 @@ define void @s_shuffle_v2p0_v4p0__3_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6552,18 +6819,43 @@ define void @s_shuffle_v2p0_v4p0__3_6() {
 }
 
 define void @s_shuffle_v2p0_v4p0__4_6() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__4_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__4_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__4_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__4_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 4, i32 6>
@@ -6608,10 +6900,8 @@ define void @s_shuffle_v2p0_v4p0__5_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6624,18 +6914,43 @@ define void @s_shuffle_v2p0_v4p0__5_6() {
 }
 
 define void @s_shuffle_v2p0_v4p0__6_6() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 6, i32 6>
@@ -6726,8 +7041,7 @@ define void @s_shuffle_v2p0_v4p0__0_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6782,8 +7096,7 @@ define void @s_shuffle_v2p0_v4p0__1_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6838,8 +7151,7 @@ define void @s_shuffle_v2p0_v4p0__2_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6894,8 +7206,7 @@ define void @s_shuffle_v2p0_v4p0__3_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:11]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6908,18 +7219,43 @@ define void @s_shuffle_v2p0_v4p0__3_7() {
 }
 
 define void @s_shuffle_v2p0_v4p0__4_7() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__4_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__4_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__4_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__4_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 4, i32 7>
@@ -6928,18 +7264,43 @@ define void @s_shuffle_v2p0_v4p0__4_7() {
 }
 
 define void @s_shuffle_v2p0_v4p0__5_7() {
-; GFX9-LABEL: s_shuffle_v2p0_v4p0__5_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[4:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v2p0_v4p0__5_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v2p0_v4p0__5_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v2p0_v4p0__5_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> <i32 5, i32 7>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll
index a15fc3212f474..261257533208b 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll
@@ -2266,8 +2266,7 @@ define void @s_shuffle_v3i64_v2i64__1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2325,8 +2324,7 @@ define void @s_shuffle_v3i64_v2i64__3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2384,10 +2382,8 @@ define void @s_shuffle_v3i64_v2i64__3_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2441,8 +2437,7 @@ define void @s_shuffle_v3i64_v2i64__3_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2491,10 +2486,8 @@ define void @s_shuffle_v3i64_v2i64__3_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2507,18 +2500,43 @@ define void @s_shuffle_v3i64_v2i64__3_2_u() {
 }
 
 define void @s_shuffle_v3i64_v2i64__3_3_u() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__3_3_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_3_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_3_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_3_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> <i32 3, i32 3, i32 poison>
@@ -2572,10 +2590,8 @@ define void @s_shuffle_v3i64_v2i64__3_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2633,10 +2649,8 @@ define void @s_shuffle_v3i64_v2i64__3_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2689,12 +2703,9 @@ define void @s_shuffle_v3i64_v2i64__3_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2707,20 +2718,48 @@ define void @s_shuffle_v3i64_v2i64__3_3_2() {
 }
 
 define void @s_shuffle_v3i64_v2i64__3_3_3() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> <i32 3, i32 3, i32 3>
@@ -2765,10 +2804,8 @@ define void @s_shuffle_v3i64_v2i64__u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2780,20 +2817,48 @@ define void @s_shuffle_v3i64_v2i64__u_0_0() {
 }
 
 define void @s_shuffle_v3i64_v2i64__0_0_0() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -2841,12 +2906,9 @@ define void @s_shuffle_v3i64_v2i64__1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2894,10 +2956,8 @@ define void @s_shuffle_v3i64_v2i64__2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2958,12 +3018,9 @@ define void @s_shuffle_v3i64_v2i64__3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3021,10 +3078,8 @@ define void @s_shuffle_v3i64_v2i64__3_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3086,12 +3141,9 @@ define void @s_shuffle_v3i64_v2i64__3_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3153,12 +3205,9 @@ define void @s_shuffle_v3i64_v2i64__3_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3171,18 +3220,43 @@ define void @s_shuffle_v3i64_v2i64__3_2_0() {
 }
 
 define void @s_shuffle_v3i64_v2i64__u_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__u_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__u_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__u_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__u_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> <i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -3190,18 +3264,43 @@ define void @s_shuffle_v3i64_v2i64__u_1_1() {
 }
 
 define void @s_shuffle_v3i64_v2i64__0_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__0_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__0_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__0_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__0_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> <i32 0, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -3209,20 +3308,48 @@ define void @s_shuffle_v3i64_v2i64__0_1_1() {
 }
 
 define void @s_shuffle_v3i64_v2i64__1_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> <i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -3230,18 +3357,43 @@ define void @s_shuffle_v3i64_v2i64__1_1_1() {
 }
 
 define void @s_shuffle_v3i64_v2i64__2_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__2_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__2_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__2_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__2_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> <i32 2, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -3294,10 +3446,8 @@ define void @s_shuffle_v3i64_v2i64__3_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3355,10 +3505,8 @@ define void @s_shuffle_v3i64_v2i64__3_u_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3420,12 +3568,9 @@ define void @s_shuffle_v3i64_v2i64__3_0_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3487,12 +3632,9 @@ define void @s_shuffle_v3i64_v2i64__3_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3591,8 +3733,7 @@ define void @s_shuffle_v3i64_v2i64__1_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3658,12 +3799,9 @@ define void @s_shuffle_v3i64_v2i64__3_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3712,10 +3850,8 @@ define void @s_shuffle_v3i64_v2i64__3_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3773,12 +3909,9 @@ define void @s_shuffle_v3i64_v2i64__3_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3836,10 +3969,8 @@ define void @s_shuffle_v3i64_v2i64__3_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3852,18 +3983,43 @@ define void @s_shuffle_v3i64_v2i64__3_1_2() {
 }
 
 define void @s_shuffle_v3i64_v2i64__u_3_3() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__u_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__u_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__u_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__u_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> <i32 poison, i32 3, i32 3>
@@ -3917,10 +4073,8 @@ define void @s_shuffle_v3i64_v2i64__0_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3978,10 +4132,8 @@ define void @s_shuffle_v3i64_v2i64__1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3994,18 +4146,43 @@ define void @s_shuffle_v3i64_v2i64__1_3_3() {
 }
 
 define void @s_shuffle_v3i64_v2i64__2_3_3() {
-; GFX9-LABEL: s_shuffle_v3i64_v2i64__2_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v2i64__2_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v2i64__2_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v2i64__2_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> <i32 2, i32 3, i32 3>
@@ -4050,10 +4227,8 @@ define void @s_shuffle_v3i64_v2i64__3_u_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4115,12 +4290,9 @@ define void @s_shuffle_v3i64_v2i64__3_0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4178,10 +4350,8 @@ define void @s_shuffle_v3i64_v2i64__3_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4234,12 +4404,9 @@ define void @s_shuffle_v3i64_v2i64__3_2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll
index f15dd7d2772e5..3c546bf8a3130 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll
@@ -4720,8 +4720,7 @@ define void @s_shuffle_v3i64_v3i64__1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4761,8 +4760,7 @@ define void @s_shuffle_v3i64_v3i64__2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4820,8 +4818,7 @@ define void @s_shuffle_v3i64_v3i64__4_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4862,8 +4859,7 @@ define void @s_shuffle_v3i64_v3i64__5_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4918,11 +4914,11 @@ define void @s_shuffle_v3i64_v3i64__5_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4972,8 +4968,7 @@ define void @s_shuffle_v3i64_v3i64__5_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5027,10 +5022,8 @@ define void @s_shuffle_v3i64_v3i64__5_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5075,10 +5068,8 @@ define void @s_shuffle_v3i64_v3i64__5_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5091,18 +5082,43 @@ define void @s_shuffle_v3i64_v3i64__5_3_u() {
 }
 
 define void @s_shuffle_v3i64_v3i64__5_4_u() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_4_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_4_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 4, i32 poison>
@@ -5147,10 +5163,8 @@ define void @s_shuffle_v3i64_v3i64__5_5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5212,12 +5226,9 @@ define void @s_shuffle_v3i64_v3i64__5_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5279,12 +5290,9 @@ define void @s_shuffle_v3i64_v3i64__5_5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5342,10 +5350,8 @@ define void @s_shuffle_v3i64_v3i64__5_5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5394,12 +5400,9 @@ define void @s_shuffle_v3i64_v3i64__5_5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5452,12 +5455,9 @@ define void @s_shuffle_v3i64_v3i64__5_5_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5470,20 +5470,48 @@ define void @s_shuffle_v3i64_v3i64__5_5_4() {
 }
 
 define void @s_shuffle_v3i64_v3i64__5_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 5, i32 5>
@@ -5528,10 +5556,8 @@ define void @s_shuffle_v3i64_v3i64__u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5543,20 +5569,48 @@ define void @s_shuffle_v3i64_v3i64__u_0_0() {
 }
 
 define void @s_shuffle_v3i64_v3i64__0_0_0() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -5604,12 +5658,9 @@ define void @s_shuffle_v3i64_v3i64__1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5657,12 +5708,9 @@ define void @s_shuffle_v3i64_v3i64__2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5710,10 +5758,8 @@ define void @s_shuffle_v3i64_v3i64__3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5771,15 +5817,12 @@ define void @s_shuffle_v3i64_v3i64__4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5838,13 +5881,11 @@ define void @s_shuffle_v3i64_v3i64__5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5899,11 +5940,11 @@ define void @s_shuffle_v3i64_v3i64__5_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5962,13 +6003,11 @@ define void @s_shuffle_v3i64_v3i64__5_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6022,12 +6061,9 @@ define void @s_shuffle_v3i64_v3i64__5_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6086,13 +6122,11 @@ define void @s_shuffle_v3i64_v3i64__5_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6150,10 +6184,8 @@ define void @s_shuffle_v3i64_v3i64__5_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6166,116 +6198,12 @@ define void @s_shuffle_v3i64_v3i64__5_4_0() {
 }
 
 define void @s_shuffle_v3i64_v3i64__u_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__u_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 poison, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__0_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__0_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 0, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__1_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__2_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__2_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 2, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__3_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__3_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 3, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__4_1_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_1_1:
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -6283,17 +6211,12 @@ define void @s_shuffle_v3i64_v3i64__4_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_1_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -6301,40 +6224,30 @@ define void @s_shuffle_v3i64_v3i64__4_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v3i64__4_1_1:
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__u_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %vec1 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 4, i32 1, i32 1>
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v3i64__5_1_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_1_1:
+define void @s_shuffle_v3i64_v3i64__0_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__0_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -6342,15 +6255,12 @@ define void @s_shuffle_v3i64_v3i64__5_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_1_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__0_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -6358,60 +6268,309 @@ define void @s_shuffle_v3i64_v3i64__5_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_1_1:
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__0_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %vec1 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 1, i32 1>
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 0, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v3i64__5_u_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_1:
+define void @s_shuffle_v3i64_v3i64__1_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s12
-; GFX900-NEXT:    s_mov_b32 s9, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s12
-; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__2_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__2_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__2_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__2_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 2, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__3_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__3_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 3, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__4_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__4_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 4, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__5_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__5_u_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -6425,11 +6584,11 @@ define void @s_shuffle_v3i64_v3i64__5_u_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6488,13 +6647,11 @@ define void @s_shuffle_v3i64_v3i64__5_0_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6552,12 +6709,9 @@ define void @s_shuffle_v3i64_v3i64__5_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6577,217 +6731,344 @@ define void @s_shuffle_v3i64_v3i64__5_3_1() {
 ; GFX900-NEXT:    ; def s[4:9]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:17]
+; GFX900-NEXT:    ; def s[12:17]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:17]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_3_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:9]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 3, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__5_4_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_4_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 4, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__u_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__u_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 poison, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__0_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__0_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__0_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__0_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 0, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__1_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__1_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__1_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__1_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 1, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v3i64__2_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:17]
+; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_3_1:
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:9]
+; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %vec1 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 3, i32 1>
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v3i64__5_4_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_1:
+define void @s_shuffle_v3i64_v3i64__3_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s12
-; GFX900-NEXT:    s_mov_b32 s9, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s12
-; GFX90A-NEXT:    s_mov_b32 s9, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_4_1:
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__3_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %vec1 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 4, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__u_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__u_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 poison, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__0_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__0_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 0, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__1_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__1_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 1, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__2_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v3i64__3_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__3_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 3, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -6840,10 +7121,8 @@ define void @s_shuffle_v3i64_v3i64__4_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6897,10 +7176,8 @@ define void @s_shuffle_v3i64_v3i64__5_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6950,8 +7227,7 @@ define void @s_shuffle_v3i64_v3i64__5_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7009,12 +7285,9 @@ define void @s_shuffle_v3i64_v3i64__5_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7064,8 +7337,7 @@ define void @s_shuffle_v3i64_v3i64__5_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7119,10 +7391,8 @@ define void @s_shuffle_v3i64_v3i64__5_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7180,10 +7450,8 @@ define void @s_shuffle_v3i64_v3i64__5_4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7282,8 +7550,7 @@ define void @s_shuffle_v3i64_v3i64__1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7323,8 +7590,7 @@ define void @s_shuffle_v3i64_v3i64__2_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7390,12 +7656,9 @@ define void @s_shuffle_v3i64_v3i64__4_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7444,12 +7707,9 @@ define void @s_shuffle_v3i64_v3i64__5_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7494,10 +7754,8 @@ define void @s_shuffle_v3i64_v3i64__5_u_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7552,13 +7810,11 @@ define void @s_shuffle_v3i64_v3i64__5_0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7612,10 +7868,8 @@ define void @s_shuffle_v3i64_v3i64__5_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7673,10 +7927,8 @@ define void @s_shuffle_v3i64_v3i64__5_2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:17]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7725,12 +7977,9 @@ define void @s_shuffle_v3i64_v3i64__5_4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7743,18 +7992,43 @@ define void @s_shuffle_v3i64_v3i64__5_4_3() {
 }
 
 define void @s_shuffle_v3i64_v3i64__u_4_4() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__u_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__u_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 poison, i32 4, i32 4>
@@ -7808,10 +8082,8 @@ define void @s_shuffle_v3i64_v3i64__0_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7869,10 +8141,8 @@ define void @s_shuffle_v3i64_v3i64__1_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7930,10 +8200,8 @@ define void @s_shuffle_v3i64_v3i64__2_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7946,18 +8214,43 @@ define void @s_shuffle_v3i64_v3i64__2_4_4() {
 }
 
 define void @s_shuffle_v3i64_v3i64__3_4_4() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__3_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__3_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 3, i32 4, i32 4>
@@ -7966,20 +8259,48 @@ define void @s_shuffle_v3i64_v3i64__3_4_4() {
 }
 
 define void @s_shuffle_v3i64_v3i64__4_4_4() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 4, i32 4, i32 4>
@@ -7988,20 +8309,48 @@ define void @s_shuffle_v3i64_v3i64__4_4_4() {
 }
 
 define void @s_shuffle_v3i64_v3i64__5_4_4() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 4, i32 4>
@@ -8042,10 +8391,8 @@ define void @s_shuffle_v3i64_v3i64__5_u_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8104,13 +8451,11 @@ define void @s_shuffle_v3i64_v3i64__5_0_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8164,10 +8509,8 @@ define void @s_shuffle_v3i64_v3i64__5_1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8225,12 +8568,9 @@ define void @s_shuffle_v3i64_v3i64__5_2_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8279,12 +8619,9 @@ define void @s_shuffle_v3i64_v3i64__5_3_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8297,18 +8634,43 @@ define void @s_shuffle_v3i64_v3i64__5_3_4() {
 }
 
 define void @s_shuffle_v3i64_v3i64__u_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__u_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__u_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 poison, i32 5, i32 5>
@@ -8362,10 +8724,8 @@ define void @s_shuffle_v3i64_v3i64__0_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8423,10 +8783,8 @@ define void @s_shuffle_v3i64_v3i64__1_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8484,10 +8842,8 @@ define void @s_shuffle_v3i64_v3i64__2_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8500,18 +8856,43 @@ define void @s_shuffle_v3i64_v3i64__2_5_5() {
 }
 
 define void @s_shuffle_v3i64_v3i64__3_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__3_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__3_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 3, i32 5, i32 5>
@@ -8520,20 +8901,48 @@ define void @s_shuffle_v3i64_v3i64__3_5_5() {
 }
 
 define void @s_shuffle_v3i64_v3i64__4_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__4_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__4_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 4, i32 5, i32 5>
@@ -8542,18 +8951,43 @@ define void @s_shuffle_v3i64_v3i64__4_5_5() {
 }
 
 define void @s_shuffle_v3i64_v3i64__5_u_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_u_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_u_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 poison, i32 5>
@@ -8607,10 +9041,8 @@ define void @s_shuffle_v3i64_v3i64__5_0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8668,10 +9100,8 @@ define void @s_shuffle_v3i64_v3i64__5_1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8729,10 +9159,8 @@ define void @s_shuffle_v3i64_v3i64__5_2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8785,12 +9213,9 @@ define void @s_shuffle_v3i64_v3i64__5_3_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8803,18 +9228,43 @@ define void @s_shuffle_v3i64_v3i64__5_3_5() {
 }
 
 define void @s_shuffle_v3i64_v3i64__5_4_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v3i64__5_4_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_4_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 4, i32 5>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll
index 6e156d2d4a2f5..7815761d29696 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll
@@ -8016,8 +8016,7 @@ define void @s_shuffle_v3i64_v4i64__1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8057,8 +8056,7 @@ define void @s_shuffle_v3i64_v4i64__2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8102,8 +8100,7 @@ define void @s_shuffle_v3i64_v4i64__3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8161,8 +8158,7 @@ define void @s_shuffle_v3i64_v4i64__5_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8203,8 +8199,7 @@ define void @s_shuffle_v3i64_v4i64__6_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8249,8 +8244,7 @@ define void @s_shuffle_v3i64_v4i64__7_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8309,10 +8303,8 @@ define void @s_shuffle_v3i64_v4i64__7_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8366,8 +8358,7 @@ define void @s_shuffle_v3i64_v4i64__7_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8425,10 +8416,8 @@ define void @s_shuffle_v3i64_v4i64__7_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8482,10 +8471,8 @@ define void @s_shuffle_v3i64_v4i64__7_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8534,10 +8521,8 @@ define void @s_shuffle_v3i64_v4i64__7_4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8550,18 +8535,43 @@ define void @s_shuffle_v3i64_v4i64__7_4_u() {
 }
 
 define void @s_shuffle_v3i64_v4i64__7_5_u() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_5_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 poison>
@@ -8606,10 +8616,8 @@ define void @s_shuffle_v3i64_v4i64__7_6_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8654,10 +8662,8 @@ define void @s_shuffle_v3i64_v4i64__7_7_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8716,13 +8722,11 @@ define void @s_shuffle_v3i64_v4i64__7_7_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8781,13 +8785,11 @@ define void @s_shuffle_v3i64_v4i64__7_7_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8841,10 +8843,8 @@ define void @s_shuffle_v3i64_v4i64__7_7_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8902,12 +8902,9 @@ define void @s_shuffle_v3i64_v4i64__7_7_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8956,12 +8953,9 @@ define void @s_shuffle_v3i64_v4i64__7_7_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9010,12 +9004,9 @@ define void @s_shuffle_v3i64_v4i64__7_7_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9028,20 +9019,48 @@ define void @s_shuffle_v3i64_v4i64__7_7_5() {
 }
 
 define void @s_shuffle_v3i64_v4i64__7_7_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_7_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_7_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 7, i32 6>
@@ -9090,12 +9109,9 @@ define void @s_shuffle_v3i64_v4i64__7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9144,10 +9160,8 @@ define void @s_shuffle_v3i64_v4i64__u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9159,20 +9173,48 @@ define void @s_shuffle_v3i64_v4i64__u_0_0() {
 }
 
 define void @s_shuffle_v3i64_v4i64__0_0_0() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
@@ -9220,12 +9262,9 @@ define void @s_shuffle_v3i64_v4i64__1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9273,12 +9312,9 @@ define void @s_shuffle_v3i64_v4i64__2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9330,12 +9366,9 @@ define void @s_shuffle_v3i64_v4i64__3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9383,10 +9416,8 @@ define void @s_shuffle_v3i64_v4i64__4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9444,15 +9475,12 @@ define void @s_shuffle_v3i64_v4i64__5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9511,13 +9539,11 @@ define void @s_shuffle_v3i64_v4i64__6_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9576,15 +9602,12 @@ define void @s_shuffle_v3i64_v4i64__7_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9639,13 +9662,11 @@ define void @s_shuffle_v3i64_v4i64__7_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9704,15 +9725,12 @@ define void @s_shuffle_v3i64_v4i64__7_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9770,12 +9788,9 @@ define void @s_shuffle_v3i64_v4i64__7_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9833,12 +9848,9 @@ define void @s_shuffle_v3i64_v4i64__7_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9897,15 +9909,12 @@ define void @s_shuffle_v3i64_v4i64__7_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9963,10 +9972,8 @@ define void @s_shuffle_v3i64_v4i64__7_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10028,12 +10035,9 @@ define void @s_shuffle_v3i64_v4i64__7_6_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10046,137 +10050,12 @@ define void @s_shuffle_v3i64_v4i64__7_6_0() {
 }
 
 define void @s_shuffle_v3i64_v4i64__u_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__0_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__1_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__2_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__2_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__3_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__3_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__4_1_1() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__5_1_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_1_1:
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10184,17 +10063,12 @@ define void @s_shuffle_v3i64_v4i64__5_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_1_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10202,42 +10076,30 @@ define void @s_shuffle_v3i64_v4i64__5_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_1_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__6_1_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_1_1:
+define void @s_shuffle_v3i64_v4i64__0_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10245,17 +10107,12 @@ define void @s_shuffle_v3i64_v4i64__6_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_1_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10263,42 +10120,32 @@ define void @s_shuffle_v3i64_v4i64__6_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_1_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_1_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_1:
+define void @s_shuffle_v3i64_v4i64__1_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10306,17 +10153,14 @@ define void @s_shuffle_v3i64_v4i64__7_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10324,200 +10168,204 @@ define void @s_shuffle_v3i64_v4i64__7_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_u_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_1:
+define void @s_shuffle_v3i64_v4i64__2_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_0_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_1:
+define void @s_shuffle_v3i64_v4i64__3_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__4_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_2_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_1:
+define void @s_shuffle_v3i64_v4i64__5_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -10526,61 +10374,57 @@ define void @s_shuffle_v3i64_v4i64__7_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_3_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_1:
+define void @s_shuffle_v3i64_v4i64__6_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -10589,92 +10433,80 @@ define void @s_shuffle_v3i64_v4i64__7_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_4_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_1:
+define void @s_shuffle_v3i64_v4i64__7_1_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_5_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_1:
+define void @s_shuffle_v3i64_v4i64__7_u_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10692,7 +10524,7 @@ define void @s_shuffle_v3i64_v4i64__7_5_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10710,32 +10542,30 @@ define void @s_shuffle_v3i64_v4i64__7_5_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 1>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_6_1() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_1:
+define void @s_shuffle_v3i64_v4i64__7_0_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10746,8 +10576,8 @@ define void @s_shuffle_v3i64_v4i64__7_6_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10755,7 +10585,7 @@ define void @s_shuffle_v3i64_v4i64__7_6_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_1:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10766,8 +10596,8 @@ define void @s_shuffle_v3i64_v4i64__7_6_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10775,190 +10605,131 @@ define void @s_shuffle_v3i64_v4i64__7_6_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_1:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__u_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__0_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__1_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__1_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__2_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__3_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__3_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__4_2_2() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+define void @s_shuffle_v3i64_v4i64__7_2_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 2, i32 2>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__5_2_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_2_2:
+define void @s_shuffle_v3i64_v4i64__7_3_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_2_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_2_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -10967,295 +10738,1418 @@ define void @s_shuffle_v3i64_v4i64__5_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__6_2_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_2_2:
+define void @s_shuffle_v3i64_v4i64__7_4_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_2_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_2_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_2_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_2:
+define void @s_shuffle_v3i64_v4i64__7_5_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_6_1() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__u_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__0_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__1_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__2_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__3_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__4_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__5_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__6_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_2_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_u_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_0_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_1_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_3_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_4_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_5_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__7_6_2() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__u_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__0_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__1_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__2_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_u_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_2:
+define void @s_shuffle_v3i64_v4i64__3_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_0_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_2:
+define void @s_shuffle_v3i64_v4i64__4_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_1_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_2:
+define void @s_shuffle_v3i64_v4i64__5_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11264,21 +12158,22 @@ define void @s_shuffle_v3i64_v4i64__7_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_3_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_2:
+define void @s_shuffle_v3i64_v4i64__6_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11287,16 +12182,16 @@ define void @s_shuffle_v3i64_v4i64__7_3_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s10, s14
 ; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11305,16 +12200,16 @@ define void @s_shuffle_v3i64_v4i64__7_3_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s10, s14
 ; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11323,23 +12218,22 @@ define void @s_shuffle_v3i64_v4i64__7_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_4_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_2:
+define void @s_shuffle_v3i64_v4i64__7_3_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11350,14 +12244,16 @@ define void @s_shuffle_v3i64_v4i64__7_4_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11368,14 +12264,16 @@ define void @s_shuffle_v3i64_v4i64__7_4_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11384,59 +12282,58 @@ define void @s_shuffle_v3i64_v4i64__7_4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_5_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_2:
+define void @s_shuffle_v3i64_v4i64__7_u_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11445,59 +12342,61 @@ define void @s_shuffle_v3i64_v4i64__7_5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_6_2() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_2:
+define void @s_shuffle_v3i64_v4i64__7_0_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_2:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_2:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11506,30 +12405,32 @@ define void @s_shuffle_v3i64_v4i64__7_6_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__u_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_3_3:
+define void @s_shuffle_v3i64_v4i64__7_1_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11537,14 +12438,17 @@ define void @s_shuffle_v3i64_v4i64__u_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11552,58 +12456,42 @@ define void @s_shuffle_v3i64_v4i64__u_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_3_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__0_3_3() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__0_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 3, i32 3>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__1_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_3_3:
+define void @s_shuffle_v3i64_v4i64__7_2_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11611,16 +12499,19 @@ define void @s_shuffle_v3i64_v4i64__1_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11628,39 +12519,43 @@ define void @s_shuffle_v3i64_v4i64__1_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_3_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 3, i32 3>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__2_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_3_3:
+define void @s_shuffle_v3i64_v4i64__7_4_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s12
-; GFX900-NEXT:    s_mov_b32 s9, s13
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11668,16 +12563,19 @@ define void @s_shuffle_v3i64_v4i64__2_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s12
-; GFX90A-NEXT:    s_mov_b32 s9, s13
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11685,94 +12583,102 @@ define void @s_shuffle_v3i64_v4i64__2_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_3_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 3, i32 3>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__3_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_3_3:
+define void @s_shuffle_v3i64_v4i64__7_5_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_3_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 3, i32 3>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__4_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_3_3:
+define void @s_shuffle_v3i64_v4i64__7_6_3() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11780,14 +12686,19 @@ define void @s_shuffle_v3i64_v4i64__4_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11795,649 +12706,531 @@ define void @s_shuffle_v3i64_v4i64__4_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_3_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 3, i32 3>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__5_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_3_3:
+define void @s_shuffle_v3i64_v4i64__u_4_4() {
+; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:13]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__0_4_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_3_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__6_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_3_3:
+define void @s_shuffle_v3i64_v4i64__1_4_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_3_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_3_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_3:
+define void @s_shuffle_v3i64_v4i64__2_4_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_3:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_u_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_3:
+define void @s_shuffle_v3i64_v4i64__3_4_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_0_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_3:
+define void @s_shuffle_v3i64_v4i64__4_4_4() {
+; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:13]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__5_4_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_1_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_3:
+define void @s_shuffle_v3i64_v4i64__6_4_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_2_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_3:
+define void @s_shuffle_v3i64_v4i64__7_4_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_4_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_3:
+define void @s_shuffle_v3i64_v4i64__7_u_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_5_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_3:
+define void @s_shuffle_v3i64_v4i64__7_0_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_6_3() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_3:
+define void @s_shuffle_v3i64_v4i64__7_1_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_3:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_3:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -12446,831 +13239,783 @@ define void @s_shuffle_v3i64_v4i64__7_6_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__u_4_4() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 poison, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__0_4_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_4_4:
+define void @s_shuffle_v3i64_v4i64__7_2_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_4_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_4_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 0, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__1_4_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_4_4:
+define void @s_shuffle_v3i64_v4i64__7_3_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_4_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_4_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 1, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__2_4_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_4_4:
+define void @s_shuffle_v3i64_v4i64__7_5_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_4_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_4_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 2, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__3_4_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_4_4:
+define void @s_shuffle_v3i64_v4i64__7_6_4() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_4_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_4_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 3, i32 4, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__4_4_4() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> <i32 4, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__5_4_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_4_4:
+define void @s_shuffle_v3i64_v4i64__u_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_4_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_4_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 poison, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__6_4_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_4_4:
+define void @s_shuffle_v3i64_v4i64__0_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_4_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_4_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 0, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_4_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_4:
+define void @s_shuffle_v3i64_v4i64__1_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 1, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_u_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_4:
+define void @s_shuffle_v3i64_v4i64__2_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 2, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_0_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_4:
+define void @s_shuffle_v3i64_v4i64__3_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 3, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_1_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_4:
+define void @s_shuffle_v3i64_v4i64__4_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 4, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_2_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_4:
+define void @s_shuffle_v3i64_v4i64__5_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_3_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_4:
+define void @s_shuffle_v3i64_v4i64__6_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_5_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_4:
+define void @s_shuffle_v3i64_v4i64__7_5_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s6
-; GFX900-NEXT:    s_mov_b32 s11, s7
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s6
-; GFX90A-NEXT:    s_mov_b32 s11, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_6_4() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_4:
+define void @s_shuffle_v3i64_v4i64__7_u_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_4:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_4:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__u_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 poison, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__0_5_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_5_5:
+define void @s_shuffle_v3i64_v4i64__7_0_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13278,17 +14023,19 @@ define void @s_shuffle_v3i64_v4i64__0_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_5_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13296,68 +14043,68 @@ define void @s_shuffle_v3i64_v4i64__0_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_5_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 0, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__1_5_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_5_5:
+define void @s_shuffle_v3i64_v4i64__7_1_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_5_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_5_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -13366,231 +14113,145 @@ define void @s_shuffle_v3i64_v4i64__1_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 1, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__2_5_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_5_5:
+define void @s_shuffle_v3i64_v4i64__7_2_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_5_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_5_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 2, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__3_5_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_5_5:
+define void @s_shuffle_v3i64_v4i64__7_3_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_5_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_5_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 3, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__4_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 4, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__5_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__6_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__6_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__7_5_5() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_u_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_5:
+define void @s_shuffle_v3i64_v4i64__7_4_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13598,6 +14259,8 @@ define void @s_shuffle_v3i64_v4i64__7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13605,7 +14268,7 @@ define void @s_shuffle_v3i64_v4i64__7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13613,6 +14276,8 @@ define void @s_shuffle_v3i64_v4i64__7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13620,41 +14285,37 @@ define void @s_shuffle_v3i64_v4i64__7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_0_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_5:
+define void @s_shuffle_v3i64_v4i64__7_6_5() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13662,19 +14323,16 @@ define void @s_shuffle_v3i64_v4i64__7_0_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13682,35 +14340,73 @@ define void @s_shuffle_v3i64_v4i64__7_0_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_0_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3i64_v4i64__u_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 0, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 poison, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_1_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_5:
+define void @s_shuffle_v3i64_v4i64__0_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13719,16 +14415,16 @@ define void @s_shuffle_v3i64_v4i64__7_1_5() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13737,16 +14433,16 @@ define void @s_shuffle_v3i64_v4i64__7_1_5() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_1_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -13755,360 +14451,300 @@ define void @s_shuffle_v3i64_v4i64__7_1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 1, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 0, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_2_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_5:
+define void @s_shuffle_v3i64_v4i64__1_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_2_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 2, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 1, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_3_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_5:
+define void @s_shuffle_v3i64_v4i64__2_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_3_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 3, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 2, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_4_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_5:
+define void @s_shuffle_v3i64_v4i64__3_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 4, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 3, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__7_6_5() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_5:
+define void @s_shuffle_v3i64_v4i64__4_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_5:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_5:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__u_6_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__u_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 poison, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 4, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__0_6_6() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_6_6:
+define void @s_shuffle_v3i64_v4i64__5_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_6_6:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__0_6_6:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 0, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__1_6_6() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_6_6:
+define void @s_shuffle_v3i64_v4i64__6_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -14116,17 +14752,14 @@ define void @s_shuffle_v3i64_v4i64__1_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_6_6:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -14134,42 +14767,34 @@ define void @s_shuffle_v3i64_v4i64__1_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_6_6:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 1, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__2_6_6() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_6_6:
+define void @s_shuffle_v3i64_v4i64__7_6_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_6:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -14177,17 +14802,14 @@ define void @s_shuffle_v3i64_v4i64__2_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_6_6:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -14195,190 +14817,63 @@ define void @s_shuffle_v3i64_v4i64__2_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_6_6:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 2, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3i64_v4i64__3_6_6() {
-; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_6_6:
+define void @s_shuffle_v3i64_v4i64__7_u_6() {
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_6_6:
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_6_6:
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 3, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__4_6_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 4, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__5_6_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__5_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 5, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__6_6_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__7_6_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3i64_v4i64__7_u_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_u_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 poison, i32 6>
@@ -14432,10 +14927,8 @@ define void @s_shuffle_v3i64_v4i64__7_0_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14493,10 +14986,8 @@ define void @s_shuffle_v3i64_v4i64__7_1_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14554,10 +15045,8 @@ define void @s_shuffle_v3i64_v4i64__7_2_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14615,10 +15104,8 @@ define void @s_shuffle_v3i64_v4i64__7_3_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14671,12 +15158,9 @@ define void @s_shuffle_v3i64_v4i64__7_4_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14689,18 +15173,43 @@ define void @s_shuffle_v3i64_v4i64__7_4_6() {
 }
 
 define void @s_shuffle_v3i64_v4i64__7_5_6() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_5_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 6>
@@ -14745,10 +15254,8 @@ define void @s_shuffle_v3i64_v4i64__u_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14806,10 +15313,8 @@ define void @s_shuffle_v3i64_v4i64__0_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14871,12 +15376,9 @@ define void @s_shuffle_v3i64_v4i64__1_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14934,12 +15436,9 @@ define void @s_shuffle_v3i64_v4i64__2_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15001,12 +15500,9 @@ define void @s_shuffle_v3i64_v4i64__3_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15019,20 +15515,48 @@ define void @s_shuffle_v3i64_v4i64__3_7_7() {
 }
 
 define void @s_shuffle_v3i64_v4i64__4_7_7() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__4_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 4, i32 7, i32 7>
@@ -15081,12 +15605,9 @@ define void @s_shuffle_v3i64_v4i64__5_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15139,12 +15660,9 @@ define void @s_shuffle_v3i64_v4i64__6_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15193,10 +15711,8 @@ define void @s_shuffle_v3i64_v4i64__7_u_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15258,12 +15774,9 @@ define void @s_shuffle_v3i64_v4i64__7_0_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15321,10 +15834,8 @@ define void @s_shuffle_v3i64_v4i64__7_1_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15386,12 +15897,9 @@ define void @s_shuffle_v3i64_v4i64__7_2_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15449,12 +15957,9 @@ define void @s_shuffle_v3i64_v4i64__7_3_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15507,12 +16012,9 @@ define void @s_shuffle_v3i64_v4i64__7_4_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15525,20 +16027,48 @@ define void @s_shuffle_v3i64_v4i64__7_4_7() {
 }
 
 define void @s_shuffle_v3i64_v4i64__7_5_7() {
-; GFX9-LABEL: s_shuffle_v3i64_v4i64__7_5_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_5_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> <i32 7, i32 5, i32 7>
@@ -15587,12 +16117,9 @@ define void @s_shuffle_v3i64_v4i64__7_6_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll
index fe132493ce536..fdba1e81224d0 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll
@@ -2266,8 +2266,7 @@ define void @s_shuffle_v3p0_v2p0__1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2325,8 +2324,7 @@ define void @s_shuffle_v3p0_v2p0__3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2384,10 +2382,8 @@ define void @s_shuffle_v3p0_v2p0__3_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2441,8 +2437,7 @@ define void @s_shuffle_v3p0_v2p0__3_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2491,10 +2486,8 @@ define void @s_shuffle_v3p0_v2p0__3_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2507,18 +2500,43 @@ define void @s_shuffle_v3p0_v2p0__3_2_u() {
 }
 
 define void @s_shuffle_v3p0_v2p0__3_3_u() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__3_3_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_3_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_3_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_3_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> <i32 3, i32 3, i32 poison>
@@ -2572,10 +2590,8 @@ define void @s_shuffle_v3p0_v2p0__3_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2633,10 +2649,8 @@ define void @s_shuffle_v3p0_v2p0__3_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2689,12 +2703,9 @@ define void @s_shuffle_v3p0_v2p0__3_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2707,20 +2718,48 @@ define void @s_shuffle_v3p0_v2p0__3_3_2() {
 }
 
 define void @s_shuffle_v3p0_v2p0__3_3_3() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> <i32 3, i32 3, i32 3>
@@ -2765,10 +2804,8 @@ define void @s_shuffle_v3p0_v2p0__u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2780,20 +2817,48 @@ define void @s_shuffle_v3p0_v2p0__u_0_0() {
 }
 
 define void @s_shuffle_v3p0_v2p0__0_0_0() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -2841,12 +2906,9 @@ define void @s_shuffle_v3p0_v2p0__1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2894,10 +2956,8 @@ define void @s_shuffle_v3p0_v2p0__2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -2958,12 +3018,9 @@ define void @s_shuffle_v3p0_v2p0__3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3021,10 +3078,8 @@ define void @s_shuffle_v3p0_v2p0__3_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3086,12 +3141,9 @@ define void @s_shuffle_v3p0_v2p0__3_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3153,12 +3205,9 @@ define void @s_shuffle_v3p0_v2p0__3_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3171,18 +3220,43 @@ define void @s_shuffle_v3p0_v2p0__3_2_0() {
 }
 
 define void @s_shuffle_v3p0_v2p0__u_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__u_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__u_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__u_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__u_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> <i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -3190,18 +3264,43 @@ define void @s_shuffle_v3p0_v2p0__u_1_1() {
 }
 
 define void @s_shuffle_v3p0_v2p0__0_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__0_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__0_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__0_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__0_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> <i32 0, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -3209,20 +3308,48 @@ define void @s_shuffle_v3p0_v2p0__0_1_1() {
 }
 
 define void @s_shuffle_v3p0_v2p0__1_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> <i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -3230,18 +3357,43 @@ define void @s_shuffle_v3p0_v2p0__1_1_1() {
 }
 
 define void @s_shuffle_v3p0_v2p0__2_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__2_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__2_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__2_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__2_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> <i32 2, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -3294,10 +3446,8 @@ define void @s_shuffle_v3p0_v2p0__3_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3355,10 +3505,8 @@ define void @s_shuffle_v3p0_v2p0__3_u_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3420,12 +3568,9 @@ define void @s_shuffle_v3p0_v2p0__3_0_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3487,12 +3632,9 @@ define void @s_shuffle_v3p0_v2p0__3_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3591,8 +3733,7 @@ define void @s_shuffle_v3p0_v2p0__1_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3658,12 +3799,9 @@ define void @s_shuffle_v3p0_v2p0__3_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3712,10 +3850,8 @@ define void @s_shuffle_v3p0_v2p0__3_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3773,12 +3909,9 @@ define void @s_shuffle_v3p0_v2p0__3_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3836,10 +3969,8 @@ define void @s_shuffle_v3p0_v2p0__3_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3852,18 +3983,43 @@ define void @s_shuffle_v3p0_v2p0__3_1_2() {
 }
 
 define void @s_shuffle_v3p0_v2p0__u_3_3() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__u_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__u_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__u_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__u_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> <i32 poison, i32 3, i32 3>
@@ -3917,10 +4073,8 @@ define void @s_shuffle_v3p0_v2p0__0_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3978,10 +4132,8 @@ define void @s_shuffle_v3p0_v2p0__1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3994,18 +4146,43 @@ define void @s_shuffle_v3p0_v2p0__1_3_3() {
 }
 
 define void @s_shuffle_v3p0_v2p0__2_3_3() {
-; GFX9-LABEL: s_shuffle_v3p0_v2p0__2_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v2p0__2_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v2p0__2_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v2p0__2_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> <i32 2, i32 3, i32 3>
@@ -4050,10 +4227,8 @@ define void @s_shuffle_v3p0_v2p0__3_u_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4115,12 +4290,9 @@ define void @s_shuffle_v3p0_v2p0__3_0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4178,10 +4350,8 @@ define void @s_shuffle_v3p0_v2p0__3_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4234,12 +4404,9 @@ define void @s_shuffle_v3p0_v2p0__3_2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll
index b6f4e3091b61f..422bcb5b4414a 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll
@@ -4720,8 +4720,7 @@ define void @s_shuffle_v3p0_v3p0__1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4761,8 +4760,7 @@ define void @s_shuffle_v3p0_v3p0__2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4820,8 +4818,7 @@ define void @s_shuffle_v3p0_v3p0__4_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4862,8 +4859,7 @@ define void @s_shuffle_v3p0_v3p0__5_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4918,11 +4914,11 @@ define void @s_shuffle_v3p0_v3p0__5_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4972,8 +4968,7 @@ define void @s_shuffle_v3p0_v3p0__5_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5027,10 +5022,8 @@ define void @s_shuffle_v3p0_v3p0__5_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5075,10 +5068,8 @@ define void @s_shuffle_v3p0_v3p0__5_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5091,18 +5082,43 @@ define void @s_shuffle_v3p0_v3p0__5_3_u() {
 }
 
 define void @s_shuffle_v3p0_v3p0__5_4_u() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_4_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_4_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 4, i32 poison>
@@ -5147,10 +5163,8 @@ define void @s_shuffle_v3p0_v3p0__5_5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5212,12 +5226,9 @@ define void @s_shuffle_v3p0_v3p0__5_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5279,12 +5290,9 @@ define void @s_shuffle_v3p0_v3p0__5_5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5342,10 +5350,8 @@ define void @s_shuffle_v3p0_v3p0__5_5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5394,12 +5400,9 @@ define void @s_shuffle_v3p0_v3p0__5_5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5452,12 +5455,9 @@ define void @s_shuffle_v3p0_v3p0__5_5_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5470,20 +5470,48 @@ define void @s_shuffle_v3p0_v3p0__5_5_4() {
 }
 
 define void @s_shuffle_v3p0_v3p0__5_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 5, i32 5>
@@ -5528,10 +5556,8 @@ define void @s_shuffle_v3p0_v3p0__u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5543,20 +5569,48 @@ define void @s_shuffle_v3p0_v3p0__u_0_0() {
 }
 
 define void @s_shuffle_v3p0_v3p0__0_0_0() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -5604,12 +5658,9 @@ define void @s_shuffle_v3p0_v3p0__1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5657,12 +5708,9 @@ define void @s_shuffle_v3p0_v3p0__2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5710,10 +5758,8 @@ define void @s_shuffle_v3p0_v3p0__3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5771,15 +5817,12 @@ define void @s_shuffle_v3p0_v3p0__4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5838,13 +5881,11 @@ define void @s_shuffle_v3p0_v3p0__5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5899,11 +5940,11 @@ define void @s_shuffle_v3p0_v3p0__5_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5962,13 +6003,11 @@ define void @s_shuffle_v3p0_v3p0__5_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6022,12 +6061,9 @@ define void @s_shuffle_v3p0_v3p0__5_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6086,13 +6122,11 @@ define void @s_shuffle_v3p0_v3p0__5_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6150,10 +6184,8 @@ define void @s_shuffle_v3p0_v3p0__5_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6166,116 +6198,12 @@ define void @s_shuffle_v3p0_v3p0__5_4_0() {
 }
 
 define void @s_shuffle_v3p0_v3p0__u_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__u_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 poison, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__0_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__0_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 0, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__1_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__2_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__2_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 2, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__3_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__3_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 3, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__4_1_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_1_1:
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -6283,17 +6211,12 @@ define void @s_shuffle_v3p0_v3p0__4_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_1_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -6301,40 +6224,30 @@ define void @s_shuffle_v3p0_v3p0__4_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v3p0__4_1_1:
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__u_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 4, i32 1, i32 1>
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v3p0__5_1_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_1_1:
+define void @s_shuffle_v3p0_v3p0__0_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__0_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -6342,15 +6255,12 @@ define void @s_shuffle_v3p0_v3p0__5_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_1_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__0_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -6358,60 +6268,309 @@ define void @s_shuffle_v3p0_v3p0__5_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_1_1:
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__0_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 1, i32 1>
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 0, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v3p0__5_u_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_1:
+define void @s_shuffle_v3p0_v3p0__1_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s12
-; GFX900-NEXT:    s_mov_b32 s9, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s12
-; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__2_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__2_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__2_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__2_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 2, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__3_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__3_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 3, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__4_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__4_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 4, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__5_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__5_u_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -6425,11 +6584,11 @@ define void @s_shuffle_v3p0_v3p0__5_u_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6488,13 +6647,11 @@ define void @s_shuffle_v3p0_v3p0__5_0_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6552,12 +6709,9 @@ define void @s_shuffle_v3p0_v3p0__5_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6577,217 +6731,344 @@ define void @s_shuffle_v3p0_v3p0__5_3_1() {
 ; GFX900-NEXT:    ; def s[4:9]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:17]
+; GFX900-NEXT:    ; def s[12:17]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:17]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_3_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:9]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 3, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__5_4_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:9]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_4_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:5]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 4, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__u_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__u_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 poison, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__0_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__0_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__0_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__0_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 0, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__1_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__1_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__1_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__1_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 1, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v3p0__2_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:17]
+; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_3_1:
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:9]
+; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 3, i32 1>
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v3p0__5_4_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_1:
+define void @s_shuffle_v3p0_v3p0__3_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s12
-; GFX900-NEXT:    s_mov_b32 s9, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:9]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s12
-; GFX90A-NEXT:    s_mov_b32 s9, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_4_1:
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__3_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:5]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 4, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__u_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__u_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 poison, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__0_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__0_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 0, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__1_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__1_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 1, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__2_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v3p0__3_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__3_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> <i32 3, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -6840,10 +7121,8 @@ define void @s_shuffle_v3p0_v3p0__4_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6897,10 +7176,8 @@ define void @s_shuffle_v3p0_v3p0__5_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6950,8 +7227,7 @@ define void @s_shuffle_v3p0_v3p0__5_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7009,12 +7285,9 @@ define void @s_shuffle_v3p0_v3p0__5_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7064,8 +7337,7 @@ define void @s_shuffle_v3p0_v3p0__5_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7119,10 +7391,8 @@ define void @s_shuffle_v3p0_v3p0__5_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7180,10 +7450,8 @@ define void @s_shuffle_v3p0_v3p0__5_4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7282,8 +7550,7 @@ define void @s_shuffle_v3p0_v3p0__1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7323,8 +7590,7 @@ define void @s_shuffle_v3p0_v3p0__2_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7390,12 +7656,9 @@ define void @s_shuffle_v3p0_v3p0__4_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7444,12 +7707,9 @@ define void @s_shuffle_v3p0_v3p0__5_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7494,10 +7754,8 @@ define void @s_shuffle_v3p0_v3p0__5_u_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7552,13 +7810,11 @@ define void @s_shuffle_v3p0_v3p0__5_0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7612,10 +7868,8 @@ define void @s_shuffle_v3p0_v3p0__5_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7673,10 +7927,8 @@ define void @s_shuffle_v3p0_v3p0__5_2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[12:17]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7725,12 +7977,9 @@ define void @s_shuffle_v3p0_v3p0__5_4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7743,18 +7992,43 @@ define void @s_shuffle_v3p0_v3p0__5_4_3() {
 }
 
 define void @s_shuffle_v3p0_v3p0__u_4_4() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__u_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__u_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 poison, i32 4, i32 4>
@@ -7808,10 +8082,8 @@ define void @s_shuffle_v3p0_v3p0__0_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7869,10 +8141,8 @@ define void @s_shuffle_v3p0_v3p0__1_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7930,10 +8200,8 @@ define void @s_shuffle_v3p0_v3p0__2_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7946,18 +8214,43 @@ define void @s_shuffle_v3p0_v3p0__2_4_4() {
 }
 
 define void @s_shuffle_v3p0_v3p0__3_4_4() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__3_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__3_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 3, i32 4, i32 4>
@@ -7966,20 +8259,48 @@ define void @s_shuffle_v3p0_v3p0__3_4_4() {
 }
 
 define void @s_shuffle_v3p0_v3p0__4_4_4() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 4, i32 4, i32 4>
@@ -7988,20 +8309,48 @@ define void @s_shuffle_v3p0_v3p0__4_4_4() {
 }
 
 define void @s_shuffle_v3p0_v3p0__5_4_4() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 4, i32 4>
@@ -8042,10 +8391,8 @@ define void @s_shuffle_v3p0_v3p0__5_u_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8104,13 +8451,11 @@ define void @s_shuffle_v3p0_v3p0__5_0_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8164,10 +8509,8 @@ define void @s_shuffle_v3p0_v3p0__5_1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8225,12 +8568,9 @@ define void @s_shuffle_v3p0_v3p0__5_2_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8279,12 +8619,9 @@ define void @s_shuffle_v3p0_v3p0__5_3_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8297,18 +8634,43 @@ define void @s_shuffle_v3p0_v3p0__5_3_4() {
 }
 
 define void @s_shuffle_v3p0_v3p0__u_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__u_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__u_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 poison, i32 5, i32 5>
@@ -8362,10 +8724,8 @@ define void @s_shuffle_v3p0_v3p0__0_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8423,10 +8783,8 @@ define void @s_shuffle_v3p0_v3p0__1_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8484,10 +8842,8 @@ define void @s_shuffle_v3p0_v3p0__2_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8500,18 +8856,43 @@ define void @s_shuffle_v3p0_v3p0__2_5_5() {
 }
 
 define void @s_shuffle_v3p0_v3p0__3_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__3_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__3_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 3, i32 5, i32 5>
@@ -8520,20 +8901,48 @@ define void @s_shuffle_v3p0_v3p0__3_5_5() {
 }
 
 define void @s_shuffle_v3p0_v3p0__4_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__4_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__4_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 4, i32 5, i32 5>
@@ -8542,18 +8951,43 @@ define void @s_shuffle_v3p0_v3p0__4_5_5() {
 }
 
 define void @s_shuffle_v3p0_v3p0__5_u_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_u_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_u_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 poison, i32 5>
@@ -8607,10 +9041,8 @@ define void @s_shuffle_v3p0_v3p0__5_0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8668,10 +9100,8 @@ define void @s_shuffle_v3p0_v3p0__5_1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8729,10 +9159,8 @@ define void @s_shuffle_v3p0_v3p0__5_2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8785,12 +9213,9 @@ define void @s_shuffle_v3p0_v3p0__5_3_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8803,18 +9228,43 @@ define void @s_shuffle_v3p0_v3p0__5_3_5() {
 }
 
 define void @s_shuffle_v3p0_v3p0__5_4_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v3p0__5_4_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_4_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> <i32 5, i32 4, i32 5>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll
index b03066e66cf66..707633944e851 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll
@@ -8016,8 +8016,7 @@ define void @s_shuffle_v3p0_v4p0__1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8057,8 +8056,7 @@ define void @s_shuffle_v3p0_v4p0__2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8102,8 +8100,7 @@ define void @s_shuffle_v3p0_v4p0__3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8161,8 +8158,7 @@ define void @s_shuffle_v3p0_v4p0__5_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8203,8 +8199,7 @@ define void @s_shuffle_v3p0_v4p0__6_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8249,8 +8244,7 @@ define void @s_shuffle_v3p0_v4p0__7_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8309,10 +8303,8 @@ define void @s_shuffle_v3p0_v4p0__7_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8366,8 +8358,7 @@ define void @s_shuffle_v3p0_v4p0__7_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8425,10 +8416,8 @@ define void @s_shuffle_v3p0_v4p0__7_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8482,10 +8471,8 @@ define void @s_shuffle_v3p0_v4p0__7_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8534,10 +8521,8 @@ define void @s_shuffle_v3p0_v4p0__7_4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8550,18 +8535,43 @@ define void @s_shuffle_v3p0_v4p0__7_4_u() {
 }
 
 define void @s_shuffle_v3p0_v4p0__7_5_u() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_5_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 poison>
@@ -8606,10 +8616,8 @@ define void @s_shuffle_v3p0_v4p0__7_6_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8654,10 +8662,8 @@ define void @s_shuffle_v3p0_v4p0__7_7_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8716,13 +8722,11 @@ define void @s_shuffle_v3p0_v4p0__7_7_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8781,13 +8785,11 @@ define void @s_shuffle_v3p0_v4p0__7_7_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8841,10 +8843,8 @@ define void @s_shuffle_v3p0_v4p0__7_7_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8902,12 +8902,9 @@ define void @s_shuffle_v3p0_v4p0__7_7_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8956,12 +8953,9 @@ define void @s_shuffle_v3p0_v4p0__7_7_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9010,12 +9004,9 @@ define void @s_shuffle_v3p0_v4p0__7_7_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9028,20 +9019,48 @@ define void @s_shuffle_v3p0_v4p0__7_7_5() {
 }
 
 define void @s_shuffle_v3p0_v4p0__7_7_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_7_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_7_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 7, i32 6>
@@ -9090,12 +9109,9 @@ define void @s_shuffle_v3p0_v4p0__7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9144,10 +9160,8 @@ define void @s_shuffle_v3p0_v4p0__u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9159,20 +9173,48 @@ define void @s_shuffle_v3p0_v4p0__u_0_0() {
 }
 
 define void @s_shuffle_v3p0_v4p0__0_0_0() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
@@ -9220,12 +9262,9 @@ define void @s_shuffle_v3p0_v4p0__1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9273,12 +9312,9 @@ define void @s_shuffle_v3p0_v4p0__2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9330,12 +9366,9 @@ define void @s_shuffle_v3p0_v4p0__3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9383,10 +9416,8 @@ define void @s_shuffle_v3p0_v4p0__4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9444,15 +9475,12 @@ define void @s_shuffle_v3p0_v4p0__5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9511,13 +9539,11 @@ define void @s_shuffle_v3p0_v4p0__6_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9576,15 +9602,12 @@ define void @s_shuffle_v3p0_v4p0__7_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9639,13 +9662,11 @@ define void @s_shuffle_v3p0_v4p0__7_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9704,15 +9725,12 @@ define void @s_shuffle_v3p0_v4p0__7_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9770,12 +9788,9 @@ define void @s_shuffle_v3p0_v4p0__7_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9833,12 +9848,9 @@ define void @s_shuffle_v3p0_v4p0__7_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9897,15 +9909,12 @@ define void @s_shuffle_v3p0_v4p0__7_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9963,10 +9972,8 @@ define void @s_shuffle_v3p0_v4p0__7_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10028,12 +10035,9 @@ define void @s_shuffle_v3p0_v4p0__7_6_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10046,137 +10050,12 @@ define void @s_shuffle_v3p0_v4p0__7_6_0() {
 }
 
 define void @s_shuffle_v3p0_v4p0__u_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__0_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__1_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__2_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__2_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__3_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__3_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__4_1_1() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__5_1_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_1_1:
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10184,17 +10063,12 @@ define void @s_shuffle_v3p0_v4p0__5_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_1_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10202,42 +10076,30 @@ define void @s_shuffle_v3p0_v4p0__5_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_1_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__6_1_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_1_1:
+define void @s_shuffle_v3p0_v4p0__0_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10245,17 +10107,12 @@ define void @s_shuffle_v3p0_v4p0__6_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_1_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10263,42 +10120,32 @@ define void @s_shuffle_v3p0_v4p0__6_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_1_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_1_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_1:
+define void @s_shuffle_v3p0_v4p0__1_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10306,17 +10153,14 @@ define void @s_shuffle_v3p0_v4p0__7_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s12, s10
 ; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10324,200 +10168,204 @@ define void @s_shuffle_v3p0_v4p0__7_1_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_u_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_1:
+define void @s_shuffle_v3p0_v4p0__2_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_0_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_1:
+define void @s_shuffle_v3p0_v4p0__3_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__4_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_2_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_1:
+define void @s_shuffle_v3p0_v4p0__5_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -10526,61 +10374,57 @@ define void @s_shuffle_v3p0_v4p0__7_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_3_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_1:
+define void @s_shuffle_v3p0_v4p0__6_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -10589,92 +10433,80 @@ define void @s_shuffle_v3p0_v4p0__7_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_4_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_1:
+define void @s_shuffle_v3p0_v4p0__7_1_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_5_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_1:
+define void @s_shuffle_v3p0_v4p0__7_u_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10692,7 +10524,7 @@ define void @s_shuffle_v3p0_v4p0__7_5_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10710,32 +10542,30 @@ define void @s_shuffle_v3p0_v4p0__7_5_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 1>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_6_1() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_1:
+define void @s_shuffle_v3p0_v4p0__7_0_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10746,8 +10576,8 @@ define void @s_shuffle_v3p0_v4p0__7_6_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -10755,7 +10585,7 @@ define void @s_shuffle_v3p0_v4p0__7_6_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_1:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10766,8 +10596,8 @@ define void @s_shuffle_v3p0_v4p0__7_6_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -10775,190 +10605,131 @@ define void @s_shuffle_v3p0_v4p0__7_6_1() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_1:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__u_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__0_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__1_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__1_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__2_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__3_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__3_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__4_2_2() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+define void @s_shuffle_v3p0_v4p0__7_2_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 2, i32 2>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__5_2_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_2_2:
+define void @s_shuffle_v3p0_v4p0__7_3_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_2_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_2_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -10967,295 +10738,1418 @@ define void @s_shuffle_v3p0_v4p0__5_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__6_2_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_2_2:
+define void @s_shuffle_v3p0_v4p0__7_4_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_2_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_2_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 1>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_2_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_2:
+define void @s_shuffle_v3p0_v4p0__7_5_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_6_1() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__u_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__0_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__1_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__2_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__3_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__4_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__5_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__6_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_2_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_u_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_0_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_1_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_3_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_4_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_5_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__7_6_2() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 2>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__u_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__0_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__1_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__2_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_u_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_2:
+define void @s_shuffle_v3p0_v4p0__3_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_0_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_2:
+define void @s_shuffle_v3p0_v4p0__4_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_1_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_2:
+define void @s_shuffle_v3p0_v4p0__5_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11264,21 +12158,22 @@ define void @s_shuffle_v3p0_v4p0__7_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_3_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_2:
+define void @s_shuffle_v3p0_v4p0__6_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11287,16 +12182,16 @@ define void @s_shuffle_v3p0_v4p0__7_3_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s10, s14
 ; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11305,16 +12200,16 @@ define void @s_shuffle_v3p0_v4p0__7_3_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s10, s14
 ; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11323,23 +12218,22 @@ define void @s_shuffle_v3p0_v4p0__7_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_4_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_2:
+define void @s_shuffle_v3p0_v4p0__7_3_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11350,14 +12244,16 @@ define void @s_shuffle_v3p0_v4p0__7_4_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11368,14 +12264,16 @@ define void @s_shuffle_v3p0_v4p0__7_4_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11384,59 +12282,58 @@ define void @s_shuffle_v3p0_v4p0__7_4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_5_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_2:
+define void @s_shuffle_v3p0_v4p0__7_u_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11445,59 +12342,61 @@ define void @s_shuffle_v3p0_v4p0__7_5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_6_2() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_2:
+define void @s_shuffle_v3p0_v4p0__7_0_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_2:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_2:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -11506,30 +12405,32 @@ define void @s_shuffle_v3p0_v4p0__7_6_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__u_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_3_3:
+define void @s_shuffle_v3p0_v4p0__7_1_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11537,14 +12438,17 @@ define void @s_shuffle_v3p0_v4p0__u_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11552,58 +12456,42 @@ define void @s_shuffle_v3p0_v4p0__u_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_3_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__0_3_3() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__0_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 3, i32 3>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__1_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_3_3:
+define void @s_shuffle_v3p0_v4p0__7_2_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11611,16 +12499,19 @@ define void @s_shuffle_v3p0_v4p0__1_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11628,39 +12519,43 @@ define void @s_shuffle_v3p0_v4p0__1_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_3_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 3, i32 3>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__2_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_3_3:
+define void @s_shuffle_v3p0_v4p0__7_4_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s12
-; GFX900-NEXT:    s_mov_b32 s9, s13
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11668,16 +12563,19 @@ define void @s_shuffle_v3p0_v4p0__2_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s12
-; GFX90A-NEXT:    s_mov_b32 s9, s13
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11685,94 +12583,102 @@ define void @s_shuffle_v3p0_v4p0__2_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_3_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 3, i32 3>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__3_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_3_3:
+define void @s_shuffle_v3p0_v4p0__7_5_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s14
 ; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_3_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 3, i32 3>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__4_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_3_3:
+define void @s_shuffle_v3p0_v4p0__7_6_3() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -11780,14 +12686,19 @@ define void @s_shuffle_v3p0_v4p0__4_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11795,649 +12706,531 @@ define void @s_shuffle_v3p0_v4p0__4_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_3_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 3, i32 3>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 3>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__5_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_3_3:
+define void @s_shuffle_v3p0_v4p0__u_4_4() {
+; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:13]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__0_4_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_3_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__6_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_3_3:
+define void @s_shuffle_v3p0_v4p0__1_4_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_3_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_3_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_3:
+define void @s_shuffle_v3p0_v4p0__2_4_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_3:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_u_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_3:
+define void @s_shuffle_v3p0_v4p0__3_4_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_0_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_3:
+define void @s_shuffle_v3p0_v4p0__4_4_4() {
+; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:13]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__5_4_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_1_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_3:
+define void @s_shuffle_v3p0_v4p0__6_4_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_2_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_3:
+define void @s_shuffle_v3p0_v4p0__7_4_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_4_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_3:
+define void @s_shuffle_v3p0_v4p0__7_u_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_5_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_3:
+define void @s_shuffle_v3p0_v4p0__7_0_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_6_3() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_3:
+define void @s_shuffle_v3p0_v4p0__7_1_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_3:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_3:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -12446,831 +13239,783 @@ define void @s_shuffle_v3p0_v4p0__7_6_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__u_4_4() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 poison, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__0_4_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_4_4:
+define void @s_shuffle_v3p0_v4p0__7_2_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_4_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_4_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 0, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__1_4_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_4_4:
+define void @s_shuffle_v3p0_v4p0__7_3_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_4_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_4_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 1, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__2_4_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_4_4:
+define void @s_shuffle_v3p0_v4p0__7_5_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_4_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_4_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 2, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__3_4_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_4_4:
+define void @s_shuffle_v3p0_v4p0__7_6_4() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_4_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_4_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 3, i32 4, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__4_4_4() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> <i32 4, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 4>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__5_4_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_4_4:
+define void @s_shuffle_v3p0_v4p0__u_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_4_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_4_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 poison, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__6_4_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_4_4:
+define void @s_shuffle_v3p0_v4p0__0_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_4_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_4_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 0, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_4_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_4:
+define void @s_shuffle_v3p0_v4p0__1_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 1, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_u_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_4:
+define void @s_shuffle_v3p0_v4p0__2_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 2, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_0_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_4:
+define void @s_shuffle_v3p0_v4p0__3_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 3, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_1_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_4:
+define void @s_shuffle_v3p0_v4p0__4_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 4, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_2_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_4:
+define void @s_shuffle_v3p0_v4p0__5_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_3_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_4:
+define void @s_shuffle_v3p0_v4p0__6_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_5_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_4:
+define void @s_shuffle_v3p0_v4p0__7_5_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s6
-; GFX900-NEXT:    s_mov_b32 s11, s7
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s6
-; GFX90A-NEXT:    s_mov_b32 s11, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_6_4() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_4:
+define void @s_shuffle_v3p0_v4p0__7_u_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_4:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_4:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__u_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 poison, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__0_5_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_5_5:
+define void @s_shuffle_v3p0_v4p0__7_0_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13278,17 +14023,19 @@ define void @s_shuffle_v3p0_v4p0__0_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_5_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13296,68 +14043,68 @@ define void @s_shuffle_v3p0_v4p0__0_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_5_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 0, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__1_5_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_5_5:
+define void @s_shuffle_v3p0_v4p0__7_1_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_5_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_5_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -13366,231 +14113,145 @@ define void @s_shuffle_v3p0_v4p0__1_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 1, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__2_5_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_5_5:
+define void @s_shuffle_v3p0_v4p0__7_2_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_5_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_5_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 2, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__3_5_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_5_5:
+define void @s_shuffle_v3p0_v4p0__7_3_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_5_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_5_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 3, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__4_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 4, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__5_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__6_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__6_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__7_5_5() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_u_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_5:
+define void @s_shuffle_v3p0_v4p0__7_4_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13598,6 +14259,8 @@ define void @s_shuffle_v3p0_v4p0__7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13605,7 +14268,7 @@ define void @s_shuffle_v3p0_v4p0__7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13613,6 +14276,8 @@ define void @s_shuffle_v3p0_v4p0__7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13620,41 +14285,37 @@ define void @s_shuffle_v3p0_v4p0__7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 5>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_0_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_5:
+define void @s_shuffle_v3p0_v4p0__7_6_5() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13662,19 +14323,16 @@ define void @s_shuffle_v3p0_v4p0__7_0_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13682,35 +14340,73 @@ define void @s_shuffle_v3p0_v4p0__7_0_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_0_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p0_v4p0__u_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 poison, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_1_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_5:
+define void @s_shuffle_v3p0_v4p0__0_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -13719,16 +14415,16 @@ define void @s_shuffle_v3p0_v4p0__7_1_5() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -13737,16 +14433,16 @@ define void @s_shuffle_v3p0_v4p0__7_1_5() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_1_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -13755,360 +14451,300 @@ define void @s_shuffle_v3p0_v4p0__7_1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 0, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_2_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_5:
+define void @s_shuffle_v3p0_v4p0__1_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_2_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 1, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_3_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_5:
+define void @s_shuffle_v3p0_v4p0__2_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_3_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 2, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_4_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_5:
+define void @s_shuffle_v3p0_v4p0__3_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 3, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__7_6_5() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_5:
+define void @s_shuffle_v3p0_v4p0__4_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_5:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_5:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__u_6_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__u_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 poison, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 4, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__0_6_6() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_6_6:
+define void @s_shuffle_v3p0_v4p0__5_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_6_6:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__0_6_6:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 0, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__1_6_6() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_6_6:
+define void @s_shuffle_v3p0_v4p0__6_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -14116,17 +14752,14 @@ define void @s_shuffle_v3p0_v4p0__1_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_6_6:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -14134,42 +14767,34 @@ define void @s_shuffle_v3p0_v4p0__1_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_6_6:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 1, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__2_6_6() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_6_6:
+define void @s_shuffle_v3p0_v4p0__7_6_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_6:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -14177,17 +14802,14 @@ define void @s_shuffle_v3p0_v4p0__2_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_6_6:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -14195,190 +14817,63 @@ define void @s_shuffle_v3p0_v4p0__2_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_6_6:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 2, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v3p0_v4p0__3_6_6() {
-; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_6_6:
+define void @s_shuffle_v3p0_v4p0__7_u_6() {
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_6_6:
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_6_6:
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 3, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__4_6_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 4, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__5_6_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__5_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__6_6_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__7_6_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:13]}"(<3 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p0_v4p0__7_u_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_u_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 6>
@@ -14432,10 +14927,8 @@ define void @s_shuffle_v3p0_v4p0__7_0_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14493,10 +14986,8 @@ define void @s_shuffle_v3p0_v4p0__7_1_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14554,10 +15045,8 @@ define void @s_shuffle_v3p0_v4p0__7_2_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14615,10 +15104,8 @@ define void @s_shuffle_v3p0_v4p0__7_3_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14671,12 +15158,9 @@ define void @s_shuffle_v3p0_v4p0__7_4_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14689,18 +15173,43 @@ define void @s_shuffle_v3p0_v4p0__7_4_6() {
 }
 
 define void @s_shuffle_v3p0_v4p0__7_5_6() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_5_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 6>
@@ -14745,10 +15254,8 @@ define void @s_shuffle_v3p0_v4p0__u_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14806,10 +15313,8 @@ define void @s_shuffle_v3p0_v4p0__0_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14871,12 +15376,9 @@ define void @s_shuffle_v3p0_v4p0__1_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14934,12 +15436,9 @@ define void @s_shuffle_v3p0_v4p0__2_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15001,12 +15500,9 @@ define void @s_shuffle_v3p0_v4p0__3_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15019,20 +15515,48 @@ define void @s_shuffle_v3p0_v4p0__3_7_7() {
 }
 
 define void @s_shuffle_v3p0_v4p0__4_7_7() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__4_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 4, i32 7, i32 7>
@@ -15081,12 +15605,9 @@ define void @s_shuffle_v3p0_v4p0__5_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15139,12 +15660,9 @@ define void @s_shuffle_v3p0_v4p0__6_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15193,10 +15711,8 @@ define void @s_shuffle_v3p0_v4p0__7_u_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15258,12 +15774,9 @@ define void @s_shuffle_v3p0_v4p0__7_0_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15321,10 +15834,8 @@ define void @s_shuffle_v3p0_v4p0__7_1_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15386,12 +15897,9 @@ define void @s_shuffle_v3p0_v4p0__7_2_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15449,12 +15957,9 @@ define void @s_shuffle_v3p0_v4p0__7_3_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15507,12 +16012,9 @@ define void @s_shuffle_v3p0_v4p0__7_4_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15525,20 +16027,48 @@ define void @s_shuffle_v3p0_v4p0__7_4_7() {
 }
 
 define void @s_shuffle_v3p0_v4p0__7_5_7() {
-; GFX9-LABEL: s_shuffle_v3p0_v4p0__7_5_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_5_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 5, i32 7>
@@ -15587,12 +16117,9 @@ define void @s_shuffle_v3p0_v4p0__7_6_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:13]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll
index ac7d9557ce765..3dc06c075b039 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll
@@ -3734,8 +3734,7 @@ define void @s_shuffle_v4i64_v2i64__1_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3793,8 +3792,7 @@ define void @s_shuffle_v4i64_v2i64__3_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3852,10 +3850,8 @@ define void @s_shuffle_v4i64_v2i64__3_0_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3909,8 +3905,7 @@ define void @s_shuffle_v4i64_v2i64__3_1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3959,10 +3954,8 @@ define void @s_shuffle_v4i64_v2i64__3_2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3975,18 +3968,43 @@ define void @s_shuffle_v4i64_v2i64__3_2_u_u() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_u_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_u_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 poison>
@@ -3995,21 +4013,52 @@ define void @s_shuffle_v4i64_v2i64__3_3_u_u() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_0_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_0_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_0_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_0_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_0_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 poison>
@@ -4063,10 +4112,8 @@ define void @s_shuffle_v4i64_v2i64__3_3_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4079,20 +4126,48 @@ define void @s_shuffle_v4i64_v2i64__3_3_1_u() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_2_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_2_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_2_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_2_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_2_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 poison>
@@ -4101,20 +4176,48 @@ define void @s_shuffle_v4i64_v2i64__3_3_2_u() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_3_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_3_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_3_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_3_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_3_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 poison>
@@ -4172,12 +4275,9 @@ define void @s_shuffle_v4i64_v2i64__3_3_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4190,23 +4290,57 @@ define void @s_shuffle_v4i64_v2i64__3_3_3_0() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_3_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_3_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_3_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_3_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_3_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 1>
@@ -4259,14 +4393,10 @@ define void @s_shuffle_v4i64_v2i64__3_3_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4279,44 +4409,103 @@ define void @s_shuffle_v4i64_v2i64__3_3_3_2() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__u_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__u_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %vec1 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__u_0_0_0() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__u_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__u_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__u_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -4324,22 +4513,53 @@ define void @s_shuffle_v4i64_v2i64__u_0_0_0() {
 }
 
 define void @s_shuffle_v4i64_v2i64__0_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__0_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__0_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    s_mov_b32 s14, s8
+; GFX900-NEXT:    s_mov_b32 s15, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__0_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    s_mov_b32 s14, s8
+; GFX90A-NEXT:    s_mov_b32 s15, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__0_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -4347,22 +4567,53 @@ define void @s_shuffle_v4i64_v2i64__0_0_0_0() {
 }
 
 define void @s_shuffle_v4i64_v2i64__1_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__1_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__1_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__1_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__1_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -4370,20 +4621,48 @@ define void @s_shuffle_v4i64_v2i64__1_0_0_0() {
 }
 
 define void @s_shuffle_v4i64_v2i64__2_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__2_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__2_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__2_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__2_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -4440,12 +4719,9 @@ define void @s_shuffle_v4i64_v2i64__3_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4503,10 +4779,8 @@ define void @s_shuffle_v4i64_v2i64__3_u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4568,12 +4842,9 @@ define void @s_shuffle_v4i64_v2i64__3_1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4635,12 +4906,9 @@ define void @s_shuffle_v4i64_v2i64__3_2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4653,57 +4921,91 @@ define void @s_shuffle_v4i64_v2i64__3_2_0_0() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 0>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__3_3_u_0() {
-; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_0:
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_0_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_0:
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_0_0:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %vec1 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__3_3_u_0() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
@@ -4723,10 +5025,8 @@ define void @s_shuffle_v4i64_v2i64__3_3_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4788,12 +5088,9 @@ define void @s_shuffle_v4i64_v2i64__3_3_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4855,12 +5152,9 @@ define void @s_shuffle_v4i64_v2i64__3_3_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4873,20 +5167,48 @@ define void @s_shuffle_v4i64_v2i64__3_3_2_0() {
 }
 
 define void @s_shuffle_v4i64_v2i64__u_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__u_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__u_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__u_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__u_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -4894,20 +5216,48 @@ define void @s_shuffle_v4i64_v2i64__u_1_1_1() {
 }
 
 define void @s_shuffle_v4i64_v2i64__0_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__0_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__0_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__0_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__0_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -4915,22 +5265,53 @@ define void @s_shuffle_v4i64_v2i64__0_1_1_1() {
 }
 
 define void @s_shuffle_v4i64_v2i64__1_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__1_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__1_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__1_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__1_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -4938,20 +5319,48 @@ define void @s_shuffle_v4i64_v2i64__1_1_1_1() {
 }
 
 define void @s_shuffle_v4i64_v2i64__2_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__2_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__2_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__2_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__2_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -5008,12 +5417,9 @@ define void @s_shuffle_v4i64_v2i64__3_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5071,10 +5477,8 @@ define void @s_shuffle_v4i64_v2i64__3_u_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5136,12 +5540,9 @@ define void @s_shuffle_v4i64_v2i64__3_0_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5203,12 +5604,9 @@ define void @s_shuffle_v4i64_v2i64__3_2_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5221,46 +5619,111 @@ define void @s_shuffle_v4i64_v2i64__3_2_1_1() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %vec1 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
 
 define void @s_shuffle_v4i64_v2i64__3_3_u_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_u_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_u_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 1>
@@ -5269,21 +5732,52 @@ define void @s_shuffle_v4i64_v2i64__3_3_u_1() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_0_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_0_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_0_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_0_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_0_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 1>
@@ -5341,12 +5835,9 @@ define void @s_shuffle_v4i64_v2i64__3_3_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5445,8 +5936,7 @@ define void @s_shuffle_v4i64_v2i64__1_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5472,22 +5962,53 @@ define void @s_shuffle_v4i64_v2i64__2_2_2_2() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
@@ -5496,20 +6017,48 @@ define void @s_shuffle_v4i64_v2i64__3_2_2_2() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_u_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_u_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_u_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_u_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_u_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 poison, i32 2, i32 2>
@@ -5567,12 +6116,9 @@ define void @s_shuffle_v4i64_v2i64__3_0_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5585,23 +6131,57 @@ define void @s_shuffle_v4i64_v2i64__3_0_2_2() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_1_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_1_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_1_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_1_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_1_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 1, i32 2, i32 2>
@@ -5610,22 +6190,53 @@ define void @s_shuffle_v4i64_v2i64__3_1_2_2() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 2>
@@ -5674,12 +6285,9 @@ define void @s_shuffle_v4i64_v2i64__3_3_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5741,12 +6349,9 @@ define void @s_shuffle_v4i64_v2i64__3_3_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5812,14 +6417,10 @@ define void @s_shuffle_v4i64_v2i64__3_3_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5832,20 +6433,48 @@ define void @s_shuffle_v4i64_v2i64__3_3_1_2() {
 }
 
 define void @s_shuffle_v4i64_v2i64__u_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__u_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__u_3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__u_3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__u_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 poison, i32 3, i32 3, i32 3>
@@ -5854,23 +6483,57 @@ define void @s_shuffle_v4i64_v2i64__u_3_3_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__0_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__0_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__0_3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__0_3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__0_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 0, i32 3, i32 3, i32 3>
@@ -5928,12 +6591,9 @@ define void @s_shuffle_v4i64_v2i64__1_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5946,78 +6606,134 @@ define void @s_shuffle_v4i64_v2i64__1_3_3_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__2_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__2_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__3_u_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_u_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x i64> asm "; def $0", "=s"()
-  %vec1 = call <2 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 poison, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v2i64__3_0_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_0_3_3:
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__2_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_0_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__2_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ; def s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__2_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %vec1 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__3_u_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_u_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_u_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_u_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x i64> asm "; def $0", "=s"()
+  %vec1 = call <2 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 poison, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v2i64__3_0_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_0_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_0_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
@@ -6039,12 +6755,9 @@ define void @s_shuffle_v4i64_v2i64__3_0_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6057,23 +6770,57 @@ define void @s_shuffle_v4i64_v2i64__3_0_3_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_1_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_1_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_1_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_1_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_1_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 1, i32 3, i32 3>
@@ -6082,22 +6829,53 @@ define void @s_shuffle_v4i64_v2i64__3_1_3_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_2_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_2_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_2_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_2_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_2_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 2, i32 3, i32 3>
@@ -6106,20 +6884,48 @@ define void @s_shuffle_v4i64_v2i64__3_2_3_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_u_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_u_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_u_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 3>
@@ -6128,23 +6934,57 @@ define void @s_shuffle_v4i64_v2i64__3_3_u_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_0_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_0_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_0_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_0_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_0_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 3>
@@ -6202,12 +7042,9 @@ define void @s_shuffle_v4i64_v2i64__3_3_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6220,20 +7057,48 @@ define void @s_shuffle_v4i64_v2i64__3_3_1_3() {
 }
 
 define void @s_shuffle_v4i64_v2i64__3_3_2_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v2i64__3_3_2_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_2_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_2_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_2_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x i64> asm "; def $0", "=s"()
   %vec1 = call <2 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
index 8dd4a40d00680..1a295a4c6e8ed 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
@@ -7818,8 +7818,7 @@ define void @s_shuffle_v4i64_v3i64__1_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7859,8 +7858,7 @@ define void @s_shuffle_v4i64_v3i64__2_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7918,8 +7916,7 @@ define void @s_shuffle_v4i64_v3i64__4_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7960,8 +7957,7 @@ define void @s_shuffle_v4i64_v3i64__5_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8016,11 +8012,11 @@ define void @s_shuffle_v4i64_v3i64__5_0_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8070,8 +8066,7 @@ define void @s_shuffle_v4i64_v3i64__5_1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8125,10 +8120,8 @@ define void @s_shuffle_v4i64_v3i64__5_2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8173,10 +8166,8 @@ define void @s_shuffle_v4i64_v3i64__5_3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8189,18 +8180,43 @@ define void @s_shuffle_v4i64_v3i64__5_3_u_u() {
 }
 
 define void @s_shuffle_v4i64_v3i64__5_4_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_4_u_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_u_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_u_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_4_u_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 4, i32 poison, i32 poison>
@@ -8245,10 +8261,8 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8310,12 +8324,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8377,12 +8388,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8440,10 +8448,8 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8492,12 +8498,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8550,12 +8553,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8568,20 +8568,48 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_u() {
 }
 
 define void @s_shuffle_v4i64_v3i64__5_5_5_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_5_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_5_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_5_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_5_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 poison>
@@ -8639,12 +8667,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8706,12 +8731,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8773,12 +8795,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8835,14 +8854,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8895,14 +8910,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8915,22 +8926,53 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_4() {
 }
 
 define void @s_shuffle_v4i64_v3i64__5_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
@@ -8979,12 +9021,9 @@ define void @s_shuffle_v4i64_v3i64__u_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8996,22 +9035,53 @@ define void @s_shuffle_v4i64_v3i64__u_0_0_0() {
 }
 
 define void @s_shuffle_v4i64_v3i64__0_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__0_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    s_mov_b32 s14, s8
+; GFX900-NEXT:    s_mov_b32 s15, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    s_mov_b32 s14, s8
+; GFX90A-NEXT:    s_mov_b32 s15, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__0_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -9063,14 +9133,10 @@ define void @s_shuffle_v4i64_v3i64__1_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9122,14 +9188,10 @@ define void @s_shuffle_v4i64_v3i64__2_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9181,12 +9243,9 @@ define void @s_shuffle_v4i64_v3i64__3_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9248,17 +9307,13 @@ define void @s_shuffle_v4i64_v3i64__4_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9321,15 +9376,12 @@ define void @s_shuffle_v4i64_v3i64__5_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9388,13 +9440,11 @@ define void @s_shuffle_v4i64_v3i64__5_u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9457,15 +9507,12 @@ define void @s_shuffle_v4i64_v3i64__5_1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9527,14 +9574,10 @@ define void @s_shuffle_v4i64_v3i64__5_2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9597,15 +9640,12 @@ define void @s_shuffle_v4i64_v3i64__5_3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9667,12 +9707,9 @@ define void @s_shuffle_v4i64_v3i64__5_4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9738,14 +9775,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9807,12 +9840,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9878,14 +9908,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9951,14 +9977,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10020,12 +10042,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10091,14 +10110,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10111,126 +10126,12 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_0() {
 }
 
 define void @s_shuffle_v4i64_v3i64__u_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v3i64__0_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__0_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v3i64__1_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__1_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v3i64__2_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v3i64__3_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v3i64__4_1_1_1() {
-; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_1_1_1:
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    s_mov_b32 s14, s10
@@ -10240,13 +10141,273 @@ define void @s_shuffle_v4i64_v3i64__4_1_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_1_1_1:
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__u_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v3i64__0_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__0_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v3i64__1_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__1_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v3i64__2_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__2_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v3i64__3_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__3_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v3i64__4_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:9]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s6
@@ -10269,12 +10430,9 @@ define void @s_shuffle_v4i64_v3i64__4_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10332,12 +10490,9 @@ define void @s_shuffle_v4i64_v3i64__5_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10396,13 +10551,11 @@ define void @s_shuffle_v4i64_v3i64__5_u_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10465,15 +10618,12 @@ define void @s_shuffle_v4i64_v3i64__5_0_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10531,14 +10681,10 @@ define void @s_shuffle_v4i64_v3i64__5_2_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10601,15 +10747,12 @@ define void @s_shuffle_v4i64_v3i64__5_3_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10671,12 +10814,9 @@ define void @s_shuffle_v4i64_v3i64__5_4_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10742,14 +10882,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10811,12 +10947,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10882,14 +11015,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10951,14 +11080,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11020,12 +11145,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11091,14 +11213,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11111,20 +11229,48 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_1() {
 }
 
 define void @s_shuffle_v4i64_v3i64__u_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__u_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -11132,20 +11278,48 @@ define void @s_shuffle_v4i64_v3i64__u_2_2_2() {
 }
 
 define void @s_shuffle_v4i64_v3i64__0_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__0_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__0_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -11153,22 +11327,53 @@ define void @s_shuffle_v4i64_v3i64__0_2_2_2() {
 }
 
 define void @s_shuffle_v4i64_v3i64__1_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__1_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__1_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -11176,22 +11381,53 @@ define void @s_shuffle_v4i64_v3i64__1_2_2_2() {
 }
 
 define void @s_shuffle_v4i64_v3i64__2_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__2_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -11199,20 +11435,48 @@ define void @s_shuffle_v4i64_v3i64__2_2_2_2() {
 }
 
 define void @s_shuffle_v4i64_v3i64__3_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__3_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -11269,12 +11533,9 @@ define void @s_shuffle_v4i64_v3i64__4_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11332,12 +11593,9 @@ define void @s_shuffle_v4i64_v3i64__5_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11391,10 +11649,8 @@ define void @s_shuffle_v4i64_v3i64__5_u_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11456,14 +11712,10 @@ define void @s_shuffle_v4i64_v3i64__5_0_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11517,10 +11769,8 @@ define void @s_shuffle_v4i64_v3i64__5_1_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11578,12 +11828,9 @@ define void @s_shuffle_v4i64_v3i64__5_3_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11645,12 +11892,9 @@ define void @s_shuffle_v4i64_v3i64__5_4_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11712,12 +11956,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11779,12 +12020,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11846,14 +12084,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11919,14 +12153,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11992,12 +12222,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12063,14 +12290,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12169,8 +12392,7 @@ define void @s_shuffle_v4i64_v3i64__1_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12210,8 +12432,7 @@ define void @s_shuffle_v4i64_v3i64__2_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12281,14 +12502,10 @@ define void @s_shuffle_v4i64_v3i64__4_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12341,14 +12558,10 @@ define void @s_shuffle_v4i64_v3i64__5_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12397,12 +12610,9 @@ define void @s_shuffle_v4i64_v3i64__5_u_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12465,15 +12675,12 @@ define void @s_shuffle_v4i64_v3i64__5_0_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12531,12 +12738,9 @@ define void @s_shuffle_v4i64_v3i64__5_1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12598,14 +12802,10 @@ define void @s_shuffle_v4i64_v3i64__5_2_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:21]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s20
-; GFX942-NEXT:    s_mov_b32 s9, s21
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s16
-; GFX942-NEXT:    s_mov_b32 s13, s17
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12658,14 +12858,10 @@ define void @s_shuffle_v4i64_v3i64__5_4_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12722,14 +12918,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12782,12 +12974,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12853,14 +13042,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:21]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s20
-; GFX942-NEXT:    s_mov_b32 s9, s21
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12926,14 +13111,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:21]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s20
-; GFX942-NEXT:    s_mov_b32 s9, s21
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12995,12 +13176,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13057,14 +13235,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13077,20 +13251,48 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_3() {
 }
 
 define void @s_shuffle_v4i64_v3i64__u_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__u_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 poison, i32 4, i32 4, i32 4>
@@ -13144,12 +13346,9 @@ define void @s_shuffle_v4i64_v3i64__0_4_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13211,12 +13410,9 @@ define void @s_shuffle_v4i64_v3i64__1_4_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13278,12 +13474,9 @@ define void @s_shuffle_v4i64_v3i64__2_4_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13296,20 +13489,48 @@ define void @s_shuffle_v4i64_v3i64__2_4_4_4() {
 }
 
 define void @s_shuffle_v4i64_v3i64__3_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__3_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
@@ -13318,22 +13539,53 @@ define void @s_shuffle_v4i64_v3i64__3_4_4_4() {
 }
 
 define void @s_shuffle_v4i64_v3i64__4_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__4_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__4_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
@@ -13342,22 +13594,53 @@ define void @s_shuffle_v4i64_v3i64__4_4_4_4() {
 }
 
 define void @s_shuffle_v4i64_v3i64__5_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 4, i32 4, i32 4>
@@ -13402,12 +13685,9 @@ define void @s_shuffle_v4i64_v3i64__5_u_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13466,15 +13746,12 @@ define void @s_shuffle_v4i64_v3i64__5_0_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13532,12 +13809,9 @@ define void @s_shuffle_v4i64_v3i64__5_1_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13599,12 +13873,9 @@ define void @s_shuffle_v4i64_v3i64__5_2_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13657,14 +13928,10 @@ define void @s_shuffle_v4i64_v3i64__5_3_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13717,14 +13984,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13773,12 +14036,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13840,12 +14100,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13907,12 +14164,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13974,12 +14228,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14028,14 +14279,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14048,20 +14295,48 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_4() {
 }
 
 define void @s_shuffle_v4i64_v3i64__u_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__u_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5>
@@ -14119,12 +14394,9 @@ define void @s_shuffle_v4i64_v3i64__0_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14186,12 +14458,9 @@ define void @s_shuffle_v4i64_v3i64__1_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14253,12 +14522,9 @@ define void @s_shuffle_v4i64_v3i64__2_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14271,20 +14537,48 @@ define void @s_shuffle_v4i64_v3i64__2_5_5_5() {
 }
 
 define void @s_shuffle_v4i64_v3i64__3_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__3_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5>
@@ -14293,22 +14587,53 @@ define void @s_shuffle_v4i64_v3i64__3_5_5_5() {
 }
 
 define void @s_shuffle_v4i64_v3i64__4_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__4_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__4_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5>
@@ -14317,20 +14642,48 @@ define void @s_shuffle_v4i64_v3i64__4_5_5_5() {
 }
 
 define void @s_shuffle_v4i64_v3i64__5_u_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_u_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_u_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 poison, i32 5, i32 5>
@@ -14388,12 +14741,9 @@ define void @s_shuffle_v4i64_v3i64__5_0_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14455,12 +14805,9 @@ define void @s_shuffle_v4i64_v3i64__5_1_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14522,12 +14869,9 @@ define void @s_shuffle_v4i64_v3i64__5_2_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14584,14 +14928,10 @@ define void @s_shuffle_v4i64_v3i64__5_3_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14604,20 +14944,48 @@ define void @s_shuffle_v4i64_v3i64__5_3_5_5() {
 }
 
 define void @s_shuffle_v4i64_v3i64__5_4_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_4_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_4_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x i64> asm "; def $0", "=s"()
   %vec1 = call <3 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 4, i32 5, i32 5>
@@ -14666,12 +15034,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14737,14 +15102,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14810,14 +15171,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14879,12 +15236,9 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14937,14 +15291,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15001,14 +15351,10 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll
index ea9ef2f1ac94a..d026e3b08b171 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll
@@ -13289,8 +13289,7 @@ define void @s_shuffle_v4i64_v4i64__1_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13330,8 +13329,7 @@ define void @s_shuffle_v4i64_v4i64__2_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13375,8 +13373,7 @@ define void @s_shuffle_v4i64_v4i64__3_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13434,8 +13431,7 @@ define void @s_shuffle_v4i64_v4i64__5_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13476,8 +13472,7 @@ define void @s_shuffle_v4i64_v4i64__6_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13522,8 +13517,7 @@ define void @s_shuffle_v4i64_v4i64__7_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13582,10 +13576,8 @@ define void @s_shuffle_v4i64_v4i64__7_0_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13639,8 +13631,7 @@ define void @s_shuffle_v4i64_v4i64__7_1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13698,10 +13689,8 @@ define void @s_shuffle_v4i64_v4i64__7_2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13755,10 +13744,8 @@ define void @s_shuffle_v4i64_v4i64__7_3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13807,10 +13794,8 @@ define void @s_shuffle_v4i64_v4i64__7_4_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13823,18 +13808,43 @@ define void @s_shuffle_v4i64_v4i64__7_4_u_u() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_5_u_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_5_u_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_u_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_u_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_u_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 poison, i32 poison>
@@ -13879,10 +13889,8 @@ define void @s_shuffle_v4i64_v4i64__7_6_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13927,10 +13935,8 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13989,13 +13995,11 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14054,13 +14058,11 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14114,10 +14116,8 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14175,12 +14175,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14229,12 +14226,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14283,12 +14277,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14301,20 +14292,48 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_u() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_7_6_u() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_6_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 poison>
@@ -14363,12 +14382,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14434,14 +14450,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14507,14 +14519,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14580,14 +14588,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14649,12 +14653,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14711,14 +14712,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14771,14 +14768,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14835,14 +14828,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14855,22 +14844,53 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_6() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -14919,12 +14939,9 @@ define void @s_shuffle_v4i64_v4i64__u_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14936,22 +14953,53 @@ define void @s_shuffle_v4i64_v4i64__u_0_0_0() {
 }
 
 define void @s_shuffle_v4i64_v4i64__0_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    s_mov_b32 s14, s8
+; GFX900-NEXT:    s_mov_b32 s15, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    s_mov_b32 s14, s8
+; GFX90A-NEXT:    s_mov_b32 s15, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -15003,14 +15051,10 @@ define void @s_shuffle_v4i64_v4i64__1_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15062,14 +15106,10 @@ define void @s_shuffle_v4i64_v4i64__2_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15125,14 +15165,10 @@ define void @s_shuffle_v4i64_v4i64__3_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15184,12 +15220,9 @@ define void @s_shuffle_v4i64_v4i64__4_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15251,17 +15284,13 @@ define void @s_shuffle_v4i64_v4i64__5_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15324,15 +15353,12 @@ define void @s_shuffle_v4i64_v4i64__6_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15395,17 +15421,13 @@ define void @s_shuffle_v4i64_v4i64__7_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15464,15 +15486,12 @@ define void @s_shuffle_v4i64_v4i64__7_u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15535,17 +15554,13 @@ define void @s_shuffle_v4i64_v4i64__7_1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15611,14 +15626,10 @@ define void @s_shuffle_v4i64_v4i64__7_2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15680,14 +15691,10 @@ define void @s_shuffle_v4i64_v4i64__7_3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15750,17 +15757,13 @@ define void @s_shuffle_v4i64_v4i64__7_4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15822,12 +15825,9 @@ define void @s_shuffle_v4i64_v4i64__7_5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15893,14 +15893,10 @@ define void @s_shuffle_v4i64_v4i64__7_6_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15963,15 +15959,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16030,13 +16023,11 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16099,15 +16090,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16169,14 +16157,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16238,14 +16222,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16304,15 +16284,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16375,15 +16352,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16445,12 +16419,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16463,20 +16434,48 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_0() {
 }
 
 define void @s_shuffle_v4i64_v4i64__u_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__u_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -16484,89 +16483,210 @@ define void @s_shuffle_v4i64_v4i64__u_1_1_1() {
 }
 
 define void @s_shuffle_v4i64_v4i64__0_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__1_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__1_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__2_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__2_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__1_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__2_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
 define void @s_shuffle_v4i64_v4i64__3_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__3_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -16574,20 +16694,48 @@ define void @s_shuffle_v4i64_v4i64__3_1_1_1() {
 }
 
 define void @s_shuffle_v4i64_v4i64__4_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__4_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
@@ -16644,12 +16792,9 @@ define void @s_shuffle_v4i64_v4i64__5_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16711,12 +16856,9 @@ define void @s_shuffle_v4i64_v4i64__6_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16778,12 +16920,9 @@ define void @s_shuffle_v4i64_v4i64__7_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16842,15 +16981,12 @@ define void @s_shuffle_v4i64_v4i64__7_u_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16913,17 +17049,13 @@ define void @s_shuffle_v4i64_v4i64__7_0_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16985,14 +17117,10 @@ define void @s_shuffle_v4i64_v4i64__7_2_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17054,14 +17182,10 @@ define void @s_shuffle_v4i64_v4i64__7_3_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17124,17 +17248,13 @@ define void @s_shuffle_v4i64_v4i64__7_4_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17196,12 +17316,9 @@ define void @s_shuffle_v4i64_v4i64__7_5_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17267,14 +17384,10 @@ define void @s_shuffle_v4i64_v4i64__7_6_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17337,15 +17450,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17404,13 +17514,11 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17473,15 +17581,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17539,14 +17644,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17604,14 +17705,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17670,15 +17767,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17741,15 +17835,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17811,12 +17902,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17829,149 +17917,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_1() {
 }
 
 define void @s_shuffle_v4i64_v4i64__u_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__0_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__1_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__1_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__2_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__2_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__3_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__3_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__4_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__5_2_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2:
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -17981,17 +17932,12 @@ define void @s_shuffle_v4i64_v4i64__5_2_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -18001,42 +17947,31 @@ define void @s_shuffle_v4i64_v4i64__5_2_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__u_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 2, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__6_2_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2:
+define void @s_shuffle_v4i64_v4i64__0_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -18046,15 +17981,12 @@ define void @s_shuffle_v4i64_v4i64__6_2_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -18064,42 +17996,31 @@ define void @s_shuffle_v4i64_v4i64__6_2_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 2, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_2_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2:
+define void @s_shuffle_v4i64_v4i64__1_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -18111,15 +18032,12 @@ define void @s_shuffle_v4i64_v4i64__7_2_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -18131,44 +18049,36 @@ define void @s_shuffle_v4i64_v4i64__7_2_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_u_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2:
+define void @s_shuffle_v4i64_v4i64__2_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18176,17 +18086,16 @@ define void @s_shuffle_v4i64_v4i64__7_u_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18194,115 +18103,88 @@ define void @s_shuffle_v4i64_v4i64__7_u_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_0_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2:
+define void @s_shuffle_v4i64_v4i64__3_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_1_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2:
+define void @s_shuffle_v4i64_v4i64__4_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18310,17 +18192,14 @@ define void @s_shuffle_v4i64_v4i64__7_1_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18328,32 +18207,26 @@ define void @s_shuffle_v4i64_v4i64__7_1_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__4_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_3_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2:
+define void @s_shuffle_v4i64_v4i64__5_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18362,10 +18235,10 @@ define void @s_shuffle_v4i64_v4i64__7_3_2_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18373,7 +18246,7 @@ define void @s_shuffle_v4i64_v4i64__7_3_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18382,10 +18255,10 @@ define void @s_shuffle_v4i64_v4i64__7_3_2_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18393,7 +18266,7 @@ define void @s_shuffle_v4i64_v4i64__7_3_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18402,25 +18275,22 @@ define void @s_shuffle_v4i64_v4i64__7_3_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_4_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2:
+define void @s_shuffle_v4i64_v4i64__6_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18429,10 +18299,8 @@ define void @s_shuffle_v4i64_v4i64__7_4_2_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18440,7 +18308,7 @@ define void @s_shuffle_v4i64_v4i64__7_4_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18449,10 +18317,8 @@ define void @s_shuffle_v4i64_v4i64__7_4_2_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18460,7 +18326,7 @@ define void @s_shuffle_v4i64_v4i64__7_4_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18469,65 +18335,62 @@ define void @s_shuffle_v4i64_v4i64__7_4_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_5_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2:
+define void @s_shuffle_v4i64_v4i64__7_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18536,37 +18399,32 @@ define void @s_shuffle_v4i64_v4i64__7_5_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_6_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2:
+define void @s_shuffle_v4i64_v4i64__7_u_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18574,19 +18432,17 @@ define void @s_shuffle_v4i64_v4i64__7_6_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18594,7 +18450,7 @@ define void @s_shuffle_v4i64_v4i64__7_6_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18603,61 +18459,65 @@ define void @s_shuffle_v4i64_v4i64__7_6_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_2_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2:
+define void @s_shuffle_v4i64_v4i64__7_0_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18666,35 +18526,33 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_u_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2:
+define void @s_shuffle_v4i64_v4i64__7_1_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[16:23]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18702,17 +18560,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[16:23]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18720,7 +18578,7 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18729,61 +18587,61 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_0_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2:
+define void @s_shuffle_v4i64_v4i64__7_3_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18792,67 +18650,62 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_1_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2:
+define void @s_shuffle_v4i64_v4i64__7_4_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18861,39 +18714,34 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_3_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2:
+define void @s_shuffle_v4i64_v4i64__7_5_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    s_mov_b32 s14, s16
 ; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18901,19 +18749,19 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    s_mov_b32 s14, s16
 ; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18921,7 +18769,7 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18930,203 +18778,182 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_4_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2:
+define void @s_shuffle_v4i64_v4i64__7_6_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[16:23]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[16:23]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_5_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2:
+define void @s_shuffle_v4i64_v4i64__7_7_2_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 2>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_6_2() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2:
+define void @s_shuffle_v4i64_v4i64__7_7_u_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19135,197 +18962,58 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__u_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__0_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__0_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__1_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__1_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__2_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__2_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__3_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__3_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__4_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__5_3_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3:
+define void @s_shuffle_v4i64_v4i64__7_7_0_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19334,61 +19022,63 @@ define void @s_shuffle_v4i64_v4i64__5_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__6_3_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3:
+define void @s_shuffle_v4i64_v4i64__7_7_1_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19397,65 +19087,63 @@ define void @s_shuffle_v4i64_v4i64__6_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_3_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3:
+define void @s_shuffle_v4i64_v4i64__7_7_3_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19464,86 +19152,87 @@ define void @s_shuffle_v4i64_v4i64__7_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_u_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3:
+define void @s_shuffle_v4i64_v4i64__7_7_4_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_0_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3:
+define void @s_shuffle_v4i64_v4i64__7_7_5_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19554,18 +19243,16 @@ define void @s_shuffle_v4i64_v4i64__7_0_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19576,83 +19263,81 @@ define void @s_shuffle_v4i64_v4i64__7_0_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_1_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3:
+define void @s_shuffle_v4i64_v4i64__7_7_6_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19661,35 +19346,29 @@ define void @s_shuffle_v4i64_v4i64__7_1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_2_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3:
+define void @s_shuffle_v4i64_v4i64__u_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19697,19 +19376,14 @@ define void @s_shuffle_v4i64_v4i64__7_2_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19717,46 +19391,33 @@ define void @s_shuffle_v4i64_v4i64__7_2_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__u_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_4_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3:
+define void @s_shuffle_v4i64_v4i64__0_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19764,19 +19425,14 @@ define void @s_shuffle_v4i64_v4i64__7_4_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19784,113 +19440,89 @@ define void @s_shuffle_v4i64_v4i64__7_4_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_5_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3:
+define void @s_shuffle_v4i64_v4i64__1_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_6_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3:
+define void @s_shuffle_v4i64_v4i64__2_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19898,19 +19530,16 @@ define void @s_shuffle_v4i64_v4i64__7_6_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19918,44 +19547,36 @@ define void @s_shuffle_v4i64_v4i64__7_6_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_3_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3:
+define void @s_shuffle_v4i64_v4i64__3_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19963,17 +19584,16 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19981,127 +19601,116 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_u_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3:
+define void @s_shuffle_v4i64_v4i64__4_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__4_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_0_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3:
+define void @s_shuffle_v4i64_v4i64__5_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20110,67 +19719,58 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_1_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3:
+define void @s_shuffle_v4i64_v4i64__6_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20179,27 +19779,22 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_2_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3:
+define void @s_shuffle_v4i64_v4i64__7_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -20210,12 +19805,16 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -20226,12 +19825,16 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20240,23 +19843,22 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_4_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3:
+define void @s_shuffle_v4i64_v4i64__7_u_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -20267,14 +19869,14 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -20285,14 +19887,14 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20301,61 +19903,65 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_5_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3:
+define void @s_shuffle_v4i64_v4i64__7_0_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20364,65 +19970,59 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 3>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_6_3() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3:
+define void @s_shuffle_v4i64_v4i64__7_1_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[16:23]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[16:23]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20431,584 +20031,609 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__u_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__0_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4:
+define void @s_shuffle_v4i64_v4i64__7_2_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__1_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4:
+define void @s_shuffle_v4i64_v4i64__7_4_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4:
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__2_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4:
+define void @s_shuffle_v4i64_v4i64__7_5_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__3_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4:
+define void @s_shuffle_v4i64_v4i64__7_6_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__4_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__5_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4:
+define void @s_shuffle_v4i64_v4i64__7_7_3_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__6_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4:
+define void @s_shuffle_v4i64_v4i64__7_7_u_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4:
+define void @s_shuffle_v4i64_v4i64__7_7_0_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_u_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4:
+define void @s_shuffle_v4i64_v4i64__7_7_1_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_0_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4:
+define void @s_shuffle_v4i64_v4i64__7_7_2_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_1_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4:
+define void @s_shuffle_v4i64_v4i64__7_7_4_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21017,25 +20642,22 @@ define void @s_shuffle_v4i64_v4i64__7_1_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_2_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4:
+define void @s_shuffle_v4i64_v4i64__7_7_5_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21046,18 +20668,14 @@ define void @s_shuffle_v4i64_v4i64__7_2_4_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21068,304 +20686,240 @@ define void @s_shuffle_v4i64_v4i64__7_2_4_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s16
-; GFX942-NEXT:    s_mov_b32 s13, s17
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_3_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4:
+define void @s_shuffle_v4i64_v4i64__7_7_6_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s16
-; GFX942-NEXT:    s_mov_b32 s13, s17
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_5_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4:
+define void @s_shuffle_v4i64_v4i64__u_4_4_4() {
+; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_4_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:15]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__0_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s6
-; GFX900-NEXT:    s_mov_b32 s11, s7
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s6
-; GFX90A-NEXT:    s_mov_b32 s11, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_6_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4:
+define void @s_shuffle_v4i64_v4i64__1_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_4_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4:
+define void @s_shuffle_v4i64_v4i64__2_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_u_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4:
+define void @s_shuffle_v4i64_v4i64__3_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21373,14 +20927,12 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21388,190 +20940,171 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_0_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4:
+define void @s_shuffle_v4i64_v4i64__4_4_4_4() {
+; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_4_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:15]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__5_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s22
-; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s4
 ; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s22
-; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s4
 ; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_1_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4:
+define void @s_shuffle_v4i64_v4i64__6_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s22
-; GFX900-NEXT:    s_mov_b32 s11, s23
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s22
-; GFX90A-NEXT:    s_mov_b32 s11, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_2_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4:
+define void @s_shuffle_v4i64_v4i64__7_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    s_mov_b32 s14, s4
 ; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21579,17 +21112,18 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    s_mov_b32 s14, s4
 ; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21597,46 +21131,38 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_3_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4:
+define void @s_shuffle_v4i64_v4i64__7_u_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    s_mov_b32 s14, s4
 ; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21644,19 +21170,16 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    s_mov_b32 s14, s4
 ; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21664,107 +21187,110 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:23]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_5_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4:
+define void @s_shuffle_v4i64_v4i64__7_0_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_6_4() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4:
+define void @s_shuffle_v4i64_v4i64__7_1_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:23]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s22
 ; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s22
-; GFX900-NEXT:    s_mov_b32 s11, s23
-; GFX900-NEXT:    s_mov_b32 s12, s20
-; GFX900-NEXT:    s_mov_b32 s13, s21
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    s_mov_b32 s14, s16
 ; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21772,18 +21298,19 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:23]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s22
 ; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s22
-; GFX90A-NEXT:    s_mov_b32 s11, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s20
-; GFX90A-NEXT:    s_mov_b32 s13, s21
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    s_mov_b32 s14, s16
 ; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21791,65 +21318,1353 @@ define void @s_shuffle_v4i64_v4i64__7_7_6_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 4>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_2_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_3_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_5_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_6_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_4_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_u_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_0_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_1_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_2_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_3_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_5_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_7_6_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    s_mov_b32 s12, s20
+; GFX900-NEXT:    s_mov_b32 s13, s21
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s20
+; GFX90A-NEXT:    s_mov_b32 s13, s21
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
 define void @s_shuffle_v4i64_v4i64__u_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__u_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__0_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__1_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 1, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__2_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__3_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__4_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__4_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__5_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__6_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_u_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__0_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5:
+define void @s_shuffle_v4i64_v4i64__7_0_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21857,17 +22672,19 @@ define void @s_shuffle_v4i64_v4i64__0_5_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21875,74 +22692,69 @@ define void @s_shuffle_v4i64_v4i64__0_5_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__1_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5:
+define void @s_shuffle_v4i64_v4i64__7_1_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
-; GFX900-NEXT:    s_mov_b32 s14, s10
-; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s10
-; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21951,253 +22763,150 @@ define void @s_shuffle_v4i64_v4i64__1_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 1, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__2_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5:
+define void @s_shuffle_v4i64_v4i64__7_2_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
-; GFX900-NEXT:    s_mov_b32 s14, s10
-; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s10
-; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__3_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5:
+define void @s_shuffle_v4i64_v4i64__7_3_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
-; GFX900-NEXT:    s_mov_b32 s14, s10
-; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s10
-; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__4_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__5_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__5_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__6_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__6_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__7_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_u_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5:
+define void @s_shuffle_v4i64_v4i64__7_4_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22205,6 +22914,8 @@ define void @s_shuffle_v4i64_v4i64__7_u_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
@@ -22214,7 +22925,7 @@ define void @s_shuffle_v4i64_v4i64__7_u_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22222,6 +22933,8 @@ define void @s_shuffle_v4i64_v4i64__7_u_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
@@ -22231,43 +22944,38 @@ define void @s_shuffle_v4i64_v4i64__7_u_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_0_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5:
+define void @s_shuffle_v4i64_v4i64__7_6_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22275,19 +22983,16 @@ define void @s_shuffle_v4i64_v4i64__7_0_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22295,114 +23000,92 @@ define void @s_shuffle_v4i64_v4i64__7_0_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_1_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5:
+define void @s_shuffle_v4i64_v4i64__7_7_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_2_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5:
+define void @s_shuffle_v4i64_v4i64__7_7_u_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22410,21 +23093,14 @@ define void @s_shuffle_v4i64_v4i64__7_2_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22432,34 +23108,28 @@ define void @s_shuffle_v4i64_v4i64__7_2_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_3_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5:
+define void @s_shuffle_v4i64_v4i64__7_7_0_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22470,14 +23140,16 @@ define void @s_shuffle_v4i64_v4i64__7_3_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22488,174 +23160,176 @@ define void @s_shuffle_v4i64_v4i64__7_3_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_4_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5:
+define void @s_shuffle_v4i64_v4i64__7_7_1_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_6_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5:
+define void @s_shuffle_v4i64_v4i64__7_7_2_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_5_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5:
+define void @s_shuffle_v4i64_v4i64__7_7_3_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22663,16 +23337,19 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22680,33 +23357,31 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_u_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5:
+define void @s_shuffle_v4i64_v4i64__7_7_4_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22714,6 +23389,8 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22721,7 +23398,7 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22729,6 +23406,8 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22736,201 +23415,175 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_0_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5:
+define void @s_shuffle_v4i64_v4i64__7_7_6_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    s_mov_b32 s10, s18
 ; GFX900-NEXT:    s_mov_b32 s11, s19
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    s_mov_b32 s10, s18
 ; GFX90A-NEXT:    s_mov_b32 s11, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_1_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_5:
+define void @s_shuffle_v4i64_v4i64__u_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s18
-; GFX900-NEXT:    s_mov_b32 s11, s19
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s18
-; GFX90A-NEXT:    s_mov_b32 s11, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_1_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__u_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 poison, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_2_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_5:
+define void @s_shuffle_v4i64_v4i64__0_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_2_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -22939,311 +23592,271 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 0, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_3_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_5:
+define void @s_shuffle_v4i64_v4i64__1_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_3_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 1, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_4_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5:
+define void @s_shuffle_v4i64_v4i64__2_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 5>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 2, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__7_7_6_5() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5:
+define void @s_shuffle_v4i64_v4i64__3_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s18
-; GFX900-NEXT:    s_mov_b32 s11, s19
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s18
-; GFX90A-NEXT:    s_mov_b32 s11, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__u_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 poison, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 3, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__0_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_6_6_6:
+define void @s_shuffle_v4i64_v4i64__4_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__0_6_6_6:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__4_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 0, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 4, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__1_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_6_6_6:
+define void @s_shuffle_v4i64_v4i64__5_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -23253,17 +23866,14 @@ define void @s_shuffle_v4i64_v4i64__1_6_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -23273,44 +23883,35 @@ define void @s_shuffle_v4i64_v4i64__1_6_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_6_6_6:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 1, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__2_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_6_6_6:
+define void @s_shuffle_v4i64_v4i64__6_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -23320,17 +23921,14 @@ define void @s_shuffle_v4i64_v4i64__2_6_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -23340,44 +23938,35 @@ define void @s_shuffle_v4i64_v4i64__2_6_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_6_6_6:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 2, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4i64_v4i64__3_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_6_6_6:
+define void @s_shuffle_v4i64_v4i64__7_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -23387,17 +23976,14 @@ define void @s_shuffle_v4i64_v4i64__3_6_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -23407,119 +23993,19 @@ define void @s_shuffle_v4i64_v4i64__3_6_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_6_6_6:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 3, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__4_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 4, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__5_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__5_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__6_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__6_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__7_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 6, i32 6>
@@ -23528,20 +24014,48 @@ define void @s_shuffle_v4i64_v4i64__7_6_6_6() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_u_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_u_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 6, i32 6>
@@ -23599,12 +24113,9 @@ define void @s_shuffle_v4i64_v4i64__7_0_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23666,12 +24177,9 @@ define void @s_shuffle_v4i64_v4i64__7_1_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23733,12 +24241,9 @@ define void @s_shuffle_v4i64_v4i64__7_2_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23800,12 +24305,9 @@ define void @s_shuffle_v4i64_v4i64__7_3_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23862,14 +24364,10 @@ define void @s_shuffle_v4i64_v4i64__7_4_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23882,20 +24380,48 @@ define void @s_shuffle_v4i64_v4i64__7_4_6_6() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_5_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_5_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 6, i32 6>
@@ -23904,22 +24430,53 @@ define void @s_shuffle_v4i64_v4i64__7_5_6_6() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_7_6_6() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 6>
@@ -23968,12 +24525,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24039,14 +24593,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24112,14 +24662,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24181,12 +24727,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24252,14 +24795,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24312,14 +24851,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24376,14 +24911,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24396,20 +24927,48 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_6() {
 }
 
 define void @s_shuffle_v4i64_v4i64__u_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__u_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__u_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 poison, i32 7, i32 7, i32 7>
@@ -24467,12 +25026,9 @@ define void @s_shuffle_v4i64_v4i64__0_7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24534,12 +25090,9 @@ define void @s_shuffle_v4i64_v4i64__1_7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24601,12 +25154,9 @@ define void @s_shuffle_v4i64_v4i64__2_7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24659,117 +25209,232 @@ define void @s_shuffle_v4i64_v4i64__3_7_7_7() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_7_7_7:
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 3, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__4_7_7_7() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__4_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 4, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__5_7_7_7() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__6_7_7_7() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=s"()
+  %vec1 = call <4 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4i64_v4i64__7_u_7_7() {
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_7_7:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 3, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__4_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__4_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 4, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__5_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__5_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__6_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__6_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x i64> asm "; def $0", "=s"()
-  %vec1 = call <4 x i64> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4i64_v4i64__7_u_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_u_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 7, i32 7>
@@ -24827,12 +25492,9 @@ define void @s_shuffle_v4i64_v4i64__7_0_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24894,12 +25556,9 @@ define void @s_shuffle_v4i64_v4i64__7_1_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24961,12 +25620,9 @@ define void @s_shuffle_v4i64_v4i64__7_2_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25028,12 +25684,9 @@ define void @s_shuffle_v4i64_v4i64__7_3_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25090,14 +25743,10 @@ define void @s_shuffle_v4i64_v4i64__7_4_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25110,20 +25759,48 @@ define void @s_shuffle_v4i64_v4i64__7_4_7_7() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_5_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_5_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_5_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 7, i32 7>
@@ -25132,22 +25809,53 @@ define void @s_shuffle_v4i64_v4i64__7_5_7_7() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_6_7_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_6_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 7, i32 7>
@@ -25156,20 +25864,48 @@ define void @s_shuffle_v4i64_v4i64__7_6_7_7() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_7_u_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_u_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 7>
@@ -25227,12 +25963,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25294,12 +26027,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25361,12 +26091,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_2_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25428,12 +26155,9 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25486,14 +26210,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25550,14 +26270,10 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25570,20 +26286,48 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_7() {
 }
 
 define void @s_shuffle_v4i64_v4i64__7_7_6_7() {
-; GFX9-LABEL: s_shuffle_v4i64_v4i64__7_7_6_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_6_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x i64> asm "; def $0", "=s"()
   %vec1 = call <4 x i64> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll
index b30af835a7882..14bacc2f74876 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll
@@ -3734,8 +3734,7 @@ define void @s_shuffle_v4p0_v2p0__1_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3793,8 +3792,7 @@ define void @s_shuffle_v4p0_v2p0__3_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3852,10 +3850,8 @@ define void @s_shuffle_v4p0_v2p0__3_0_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3909,8 +3905,7 @@ define void @s_shuffle_v4p0_v2p0__3_1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3959,10 +3954,8 @@ define void @s_shuffle_v4p0_v2p0__3_2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -3975,18 +3968,43 @@ define void @s_shuffle_v4p0_v2p0__3_2_u_u() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_u_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_u_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 poison>
@@ -3995,21 +4013,52 @@ define void @s_shuffle_v4p0_v2p0__3_3_u_u() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_0_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_0_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_0_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_0_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_0_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 poison>
@@ -4063,10 +4112,8 @@ define void @s_shuffle_v4p0_v2p0__3_3_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4079,20 +4126,48 @@ define void @s_shuffle_v4p0_v2p0__3_3_1_u() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_2_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_2_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_2_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_2_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_2_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 poison>
@@ -4101,20 +4176,48 @@ define void @s_shuffle_v4p0_v2p0__3_3_2_u() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_3_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_3_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_3_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_3_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_3_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 poison>
@@ -4172,12 +4275,9 @@ define void @s_shuffle_v4p0_v2p0__3_3_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4190,23 +4290,57 @@ define void @s_shuffle_v4p0_v2p0__3_3_3_0() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_3_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_3_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_3_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_3_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_3_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 1>
@@ -4259,14 +4393,10 @@ define void @s_shuffle_v4p0_v2p0__3_3_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4279,44 +4409,103 @@ define void @s_shuffle_v4p0_v2p0__3_3_3_2() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__u_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__u_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__u_0_0_0() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__u_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__u_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__u_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -4324,22 +4513,53 @@ define void @s_shuffle_v4p0_v2p0__u_0_0_0() {
 }
 
 define void @s_shuffle_v4p0_v2p0__0_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__0_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__0_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    s_mov_b32 s14, s8
+; GFX900-NEXT:    s_mov_b32 s15, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__0_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    s_mov_b32 s14, s8
+; GFX90A-NEXT:    s_mov_b32 s15, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__0_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -4347,22 +4567,53 @@ define void @s_shuffle_v4p0_v2p0__0_0_0_0() {
 }
 
 define void @s_shuffle_v4p0_v2p0__1_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__1_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__1_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__1_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__1_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -4370,20 +4621,48 @@ define void @s_shuffle_v4p0_v2p0__1_0_0_0() {
 }
 
 define void @s_shuffle_v4p0_v2p0__2_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__2_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__2_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__2_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__2_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -4440,12 +4719,9 @@ define void @s_shuffle_v4p0_v2p0__3_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4503,10 +4779,8 @@ define void @s_shuffle_v4p0_v2p0__3_u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4568,12 +4842,9 @@ define void @s_shuffle_v4p0_v2p0__3_1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4635,12 +4906,9 @@ define void @s_shuffle_v4p0_v2p0__3_2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4653,57 +4921,91 @@ define void @s_shuffle_v4p0_v2p0__3_2_0_0() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 0>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__3_3_u_0() {
-; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_0:
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_0_0:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ; def s[12:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_0:
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_0_0:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ; def s[12:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__3_3_u_0() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
@@ -4723,10 +5025,8 @@ define void @s_shuffle_v4p0_v2p0__3_3_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4788,12 +5088,9 @@ define void @s_shuffle_v4p0_v2p0__3_3_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4855,12 +5152,9 @@ define void @s_shuffle_v4p0_v2p0__3_3_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -4873,20 +5167,48 @@ define void @s_shuffle_v4p0_v2p0__3_3_2_0() {
 }
 
 define void @s_shuffle_v4p0_v2p0__u_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__u_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__u_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__u_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__u_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -4894,20 +5216,48 @@ define void @s_shuffle_v4p0_v2p0__u_1_1_1() {
 }
 
 define void @s_shuffle_v4p0_v2p0__0_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__0_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__0_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__0_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__0_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -4915,22 +5265,53 @@ define void @s_shuffle_v4p0_v2p0__0_1_1_1() {
 }
 
 define void @s_shuffle_v4p0_v2p0__1_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__1_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__1_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__1_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__1_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -4938,20 +5319,48 @@ define void @s_shuffle_v4p0_v2p0__1_1_1_1() {
 }
 
 define void @s_shuffle_v4p0_v2p0__2_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__2_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__2_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__2_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__2_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -5008,12 +5417,9 @@ define void @s_shuffle_v4p0_v2p0__3_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5071,10 +5477,8 @@ define void @s_shuffle_v4p0_v2p0__3_u_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5136,12 +5540,9 @@ define void @s_shuffle_v4p0_v2p0__3_0_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5203,12 +5604,9 @@ define void @s_shuffle_v4p0_v2p0__3_2_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5221,46 +5619,111 @@ define void @s_shuffle_v4p0_v2p0__3_2_1_1() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
 
 define void @s_shuffle_v4p0_v2p0__3_3_u_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_u_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_u_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 1>
@@ -5269,21 +5732,52 @@ define void @s_shuffle_v4p0_v2p0__3_3_u_1() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_0_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_0_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_0_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_0_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_0_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 1>
@@ -5341,12 +5835,9 @@ define void @s_shuffle_v4p0_v2p0__3_3_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5445,8 +5936,7 @@ define void @s_shuffle_v4p0_v2p0__1_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5472,22 +5962,53 @@ define void @s_shuffle_v4p0_v2p0__2_2_2_2() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
@@ -5496,20 +6017,48 @@ define void @s_shuffle_v4p0_v2p0__3_2_2_2() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_u_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_u_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_u_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_u_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_u_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 poison, i32 2, i32 2>
@@ -5567,12 +6116,9 @@ define void @s_shuffle_v4p0_v2p0__3_0_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5585,23 +6131,57 @@ define void @s_shuffle_v4p0_v2p0__3_0_2_2() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_1_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_1_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_1_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_1_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_1_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 1, i32 2, i32 2>
@@ -5610,22 +6190,53 @@ define void @s_shuffle_v4p0_v2p0__3_1_2_2() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 2>
@@ -5674,12 +6285,9 @@ define void @s_shuffle_v4p0_v2p0__3_3_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5741,12 +6349,9 @@ define void @s_shuffle_v4p0_v2p0__3_3_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5812,14 +6417,10 @@ define void @s_shuffle_v4p0_v2p0__3_3_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5832,20 +6433,48 @@ define void @s_shuffle_v4p0_v2p0__3_3_1_2() {
 }
 
 define void @s_shuffle_v4p0_v2p0__u_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__u_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__u_3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__u_3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__u_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 poison, i32 3, i32 3, i32 3>
@@ -5854,23 +6483,57 @@ define void @s_shuffle_v4p0_v2p0__u_3_3_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__0_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__0_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__0_3_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__0_3_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__0_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 0, i32 3, i32 3, i32 3>
@@ -5928,12 +6591,9 @@ define void @s_shuffle_v4p0_v2p0__1_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -5946,78 +6606,134 @@ define void @s_shuffle_v4p0_v2p0__1_3_3_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__2_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__2_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__3_u_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_u_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 poison, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v2p0__3_0_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_0_3_3:
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__2_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ; def s[8:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_0_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__2_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ; def s[8:11]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:7]
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__2_3_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__3_u_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_u_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_u_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_u_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 poison, i32 3, i32 3>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v2p0__3_0_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_0_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:7]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_0_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:7]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s14
 ; GFX90A-NEXT:    s_mov_b32 s9, s15
@@ -6039,12 +6755,9 @@ define void @s_shuffle_v4p0_v2p0__3_0_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6057,23 +6770,57 @@ define void @s_shuffle_v4p0_v2p0__3_0_3_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_1_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_1_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_1_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_1_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_1_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 1, i32 3, i32 3>
@@ -6082,22 +6829,53 @@ define void @s_shuffle_v4p0_v2p0__3_1_3_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_2_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_2_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_2_3_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_2_3_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_2_3_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 2, i32 3, i32 3>
@@ -6106,20 +6884,48 @@ define void @s_shuffle_v4p0_v2p0__3_2_3_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_u_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_u_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_u_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 3>
@@ -6128,23 +6934,57 @@ define void @s_shuffle_v4p0_v2p0__3_3_u_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_0_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_0_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:11]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_0_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_0_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_0_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 3>
@@ -6202,12 +7042,9 @@ define void @s_shuffle_v4p0_v2p0__3_3_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:3]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -6220,20 +7057,48 @@ define void @s_shuffle_v4p0_v2p0__3_3_1_3() {
 }
 
 define void @s_shuffle_v4p0_v2p0__3_3_2_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v2p0__3_3_2_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[12:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_2_3:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_2_3:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_2_3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
index e6ac554735eee..0398418b82f3d 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
@@ -7818,8 +7818,7 @@ define void @s_shuffle_v4p0_v3p0__1_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7859,8 +7858,7 @@ define void @s_shuffle_v4p0_v3p0__2_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7918,8 +7916,7 @@ define void @s_shuffle_v4p0_v3p0__4_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -7960,8 +7957,7 @@ define void @s_shuffle_v4p0_v3p0__5_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8016,11 +8012,11 @@ define void @s_shuffle_v4p0_v3p0__5_0_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8070,8 +8066,7 @@ define void @s_shuffle_v4p0_v3p0__5_1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8125,10 +8120,8 @@ define void @s_shuffle_v4p0_v3p0__5_2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8173,10 +8166,8 @@ define void @s_shuffle_v4p0_v3p0__5_3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8189,18 +8180,43 @@ define void @s_shuffle_v4p0_v3p0__5_3_u_u() {
 }
 
 define void @s_shuffle_v4p0_v3p0__5_4_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_4_u_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_u_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_u_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_4_u_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 4, i32 poison, i32 poison>
@@ -8245,10 +8261,8 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8310,12 +8324,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8377,12 +8388,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8440,10 +8448,8 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8492,12 +8498,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8550,12 +8553,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8568,20 +8568,48 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_u() {
 }
 
 define void @s_shuffle_v4p0_v3p0__5_5_5_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_5_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_5_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_5_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_5_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 poison>
@@ -8639,12 +8667,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8706,12 +8731,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8773,12 +8795,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8835,14 +8854,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8895,14 +8910,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8915,22 +8926,53 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_4() {
 }
 
 define void @s_shuffle_v4p0_v3p0__5_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
@@ -8979,12 +9021,9 @@ define void @s_shuffle_v4p0_v3p0__u_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -8996,22 +9035,53 @@ define void @s_shuffle_v4p0_v3p0__u_0_0_0() {
 }
 
 define void @s_shuffle_v4p0_v3p0__0_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__0_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    s_mov_b32 s14, s8
+; GFX900-NEXT:    s_mov_b32 s15, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    s_mov_b32 s14, s8
+; GFX90A-NEXT:    s_mov_b32 s15, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__0_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -9063,14 +9133,10 @@ define void @s_shuffle_v4p0_v3p0__1_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9122,14 +9188,10 @@ define void @s_shuffle_v4p0_v3p0__2_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9181,12 +9243,9 @@ define void @s_shuffle_v4p0_v3p0__3_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9248,17 +9307,13 @@ define void @s_shuffle_v4p0_v3p0__4_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9321,15 +9376,12 @@ define void @s_shuffle_v4p0_v3p0__5_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9388,13 +9440,11 @@ define void @s_shuffle_v4p0_v3p0__5_u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9457,15 +9507,12 @@ define void @s_shuffle_v4p0_v3p0__5_1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9527,14 +9574,10 @@ define void @s_shuffle_v4p0_v3p0__5_2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9597,15 +9640,12 @@ define void @s_shuffle_v4p0_v3p0__5_3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9667,12 +9707,9 @@ define void @s_shuffle_v4p0_v3p0__5_4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9738,14 +9775,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9807,12 +9840,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9878,14 +9908,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -9951,14 +9977,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10020,12 +10042,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10091,14 +10110,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10111,126 +10126,12 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_0() {
 }
 
 define void @s_shuffle_v4p0_v3p0__u_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v3p0__0_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__0_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v3p0__1_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__1_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v3p0__2_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v3p0__3_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v3p0__4_1_1_1() {
-; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_1_1_1:
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_1_1_1:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:13]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:9]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    s_mov_b32 s12, s10
 ; GFX900-NEXT:    s_mov_b32 s13, s11
 ; GFX900-NEXT:    s_mov_b32 s14, s10
@@ -10240,13 +10141,273 @@ define void @s_shuffle_v4p0_v3p0__4_1_1_1() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_1_1_1:
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_1_1_1:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:13]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__u_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v3p0__0_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__0_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v3p0__1_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__1_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v3p0__2_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__2_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v3p0__3_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__3_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <3 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v3p0__4_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:9]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:9]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s6
@@ -10269,12 +10430,9 @@ define void @s_shuffle_v4p0_v3p0__4_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10332,12 +10490,9 @@ define void @s_shuffle_v4p0_v3p0__5_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10396,13 +10551,11 @@ define void @s_shuffle_v4p0_v3p0__5_u_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10465,15 +10618,12 @@ define void @s_shuffle_v4p0_v3p0__5_0_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10531,14 +10681,10 @@ define void @s_shuffle_v4p0_v3p0__5_2_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10601,15 +10747,12 @@ define void @s_shuffle_v4p0_v3p0__5_3_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10671,12 +10814,9 @@ define void @s_shuffle_v4p0_v3p0__5_4_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10742,14 +10882,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10811,12 +10947,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10882,14 +11015,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -10951,14 +11080,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11020,12 +11145,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11091,14 +11213,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11111,20 +11229,48 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_1() {
 }
 
 define void @s_shuffle_v4p0_v3p0__u_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__u_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -11132,20 +11278,48 @@ define void @s_shuffle_v4p0_v3p0__u_2_2_2() {
 }
 
 define void @s_shuffle_v4p0_v3p0__0_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__0_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__0_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -11153,22 +11327,53 @@ define void @s_shuffle_v4p0_v3p0__0_2_2_2() {
 }
 
 define void @s_shuffle_v4p0_v3p0__1_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__1_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__1_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -11176,22 +11381,53 @@ define void @s_shuffle_v4p0_v3p0__1_2_2_2() {
 }
 
 define void @s_shuffle_v4p0_v3p0__2_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__2_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -11199,20 +11435,48 @@ define void @s_shuffle_v4p0_v3p0__2_2_2_2() {
 }
 
 define void @s_shuffle_v4p0_v3p0__3_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_2_2_2:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_2_2_2:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__3_2_2_2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -11269,12 +11533,9 @@ define void @s_shuffle_v4p0_v3p0__4_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11332,12 +11593,9 @@ define void @s_shuffle_v4p0_v3p0__5_2_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11391,10 +11649,8 @@ define void @s_shuffle_v4p0_v3p0__5_u_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11456,14 +11712,10 @@ define void @s_shuffle_v4p0_v3p0__5_0_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11517,10 +11769,8 @@ define void @s_shuffle_v4p0_v3p0__5_1_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11578,12 +11828,9 @@ define void @s_shuffle_v4p0_v3p0__5_3_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11645,12 +11892,9 @@ define void @s_shuffle_v4p0_v3p0__5_4_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11712,12 +11956,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11779,12 +12020,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11846,14 +12084,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11919,14 +12153,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -11992,12 +12222,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12063,14 +12290,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12169,8 +12392,7 @@ define void @s_shuffle_v4p0_v3p0__1_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12210,8 +12432,7 @@ define void @s_shuffle_v4p0_v3p0__2_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12281,14 +12502,10 @@ define void @s_shuffle_v4p0_v3p0__4_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12341,14 +12558,10 @@ define void @s_shuffle_v4p0_v3p0__5_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12397,12 +12610,9 @@ define void @s_shuffle_v4p0_v3p0__5_u_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12465,15 +12675,12 @@ define void @s_shuffle_v4p0_v3p0__5_0_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12531,12 +12738,9 @@ define void @s_shuffle_v4p0_v3p0__5_1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12598,14 +12802,10 @@ define void @s_shuffle_v4p0_v3p0__5_2_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:21]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s20
-; GFX942-NEXT:    s_mov_b32 s9, s21
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s16
-; GFX942-NEXT:    s_mov_b32 s13, s17
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12658,14 +12858,10 @@ define void @s_shuffle_v4p0_v3p0__5_4_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12722,14 +12918,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12782,12 +12974,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12853,14 +13042,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:21]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s20
-; GFX942-NEXT:    s_mov_b32 s9, s21
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12926,14 +13111,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[16:21]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s20
-; GFX942-NEXT:    s_mov_b32 s9, s21
-; GFX942-NEXT:    s_mov_b32 s10, s20
-; GFX942-NEXT:    s_mov_b32 s11, s21
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[20:21]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -12995,12 +13176,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13057,14 +13235,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13077,20 +13251,48 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_3() {
 }
 
 define void @s_shuffle_v4p0_v3p0__u_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__u_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 poison, i32 4, i32 4, i32 4>
@@ -13144,12 +13346,9 @@ define void @s_shuffle_v4p0_v3p0__0_4_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13211,12 +13410,9 @@ define void @s_shuffle_v4p0_v3p0__1_4_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13278,12 +13474,9 @@ define void @s_shuffle_v4p0_v3p0__2_4_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13296,20 +13489,48 @@ define void @s_shuffle_v4p0_v3p0__2_4_4_4() {
 }
 
 define void @s_shuffle_v4p0_v3p0__3_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__3_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
@@ -13318,22 +13539,53 @@ define void @s_shuffle_v4p0_v3p0__3_4_4_4() {
 }
 
 define void @s_shuffle_v4p0_v3p0__4_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__4_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__4_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
@@ -13342,22 +13594,53 @@ define void @s_shuffle_v4p0_v3p0__4_4_4_4() {
 }
 
 define void @s_shuffle_v4p0_v3p0__5_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_4_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 4, i32 4, i32 4>
@@ -13402,12 +13685,9 @@ define void @s_shuffle_v4p0_v3p0__5_u_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13466,15 +13746,12 @@ define void @s_shuffle_v4p0_v3p0__5_0_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:9]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13532,12 +13809,9 @@ define void @s_shuffle_v4p0_v3p0__5_1_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13599,12 +13873,9 @@ define void @s_shuffle_v4p0_v3p0__5_2_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13657,14 +13928,10 @@ define void @s_shuffle_v4p0_v3p0__5_3_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13717,14 +13984,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13773,12 +14036,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13840,12 +14100,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13907,12 +14164,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13974,12 +14228,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14028,14 +14279,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14048,20 +14295,48 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_4() {
 }
 
 define void @s_shuffle_v4p0_v3p0__u_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__u_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5>
@@ -14119,12 +14394,9 @@ define void @s_shuffle_v4p0_v3p0__0_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14186,12 +14458,9 @@ define void @s_shuffle_v4p0_v3p0__1_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14253,12 +14522,9 @@ define void @s_shuffle_v4p0_v3p0__2_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14271,20 +14537,48 @@ define void @s_shuffle_v4p0_v3p0__2_5_5_5() {
 }
 
 define void @s_shuffle_v4p0_v3p0__3_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__3_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5>
@@ -14293,22 +14587,53 @@ define void @s_shuffle_v4p0_v3p0__3_5_5_5() {
 }
 
 define void @s_shuffle_v4p0_v3p0__4_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__4_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__4_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5>
@@ -14317,20 +14642,48 @@ define void @s_shuffle_v4p0_v3p0__4_5_5_5() {
 }
 
 define void @s_shuffle_v4p0_v3p0__5_u_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_u_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_u_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 poison, i32 5, i32 5>
@@ -14388,12 +14741,9 @@ define void @s_shuffle_v4p0_v3p0__5_0_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14455,12 +14805,9 @@ define void @s_shuffle_v4p0_v3p0__5_1_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14522,12 +14869,9 @@ define void @s_shuffle_v4p0_v3p0__5_2_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s12
-; GFX942-NEXT:    s_mov_b32 s9, s13
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14584,14 +14928,10 @@ define void @s_shuffle_v4p0_v3p0__5_3_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14604,20 +14944,48 @@ define void @s_shuffle_v4p0_v3p0__5_3_5_5() {
 }
 
 define void @s_shuffle_v4p0_v3p0__5_4_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_4_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:13]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:13]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:13]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_4_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:13]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <3 x ptr> asm "; def $0", "=s"()
   %vec1 = call <3 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 4, i32 5, i32 5>
@@ -14666,12 +15034,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14737,14 +15102,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14810,14 +15171,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s16
-; GFX942-NEXT:    s_mov_b32 s9, s17
-; GFX942-NEXT:    s_mov_b32 s10, s16
-; GFX942-NEXT:    s_mov_b32 s11, s17
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14879,12 +15236,9 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14937,14 +15291,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15001,14 +15351,10 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:5]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll
index ce1c54129f706..5e61b0b51e280 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll
@@ -13289,8 +13289,7 @@ define void @s_shuffle_v4p0_v4p0__1_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13330,8 +13329,7 @@ define void @s_shuffle_v4p0_v4p0__2_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13375,8 +13373,7 @@ define void @s_shuffle_v4p0_v4p0__3_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13434,8 +13431,7 @@ define void @s_shuffle_v4p0_v4p0__5_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13476,8 +13472,7 @@ define void @s_shuffle_v4p0_v4p0__6_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13522,8 +13517,7 @@ define void @s_shuffle_v4p0_v4p0__7_u_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13582,10 +13576,8 @@ define void @s_shuffle_v4p0_v4p0__7_0_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13639,8 +13631,7 @@ define void @s_shuffle_v4p0_v4p0__7_1_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13698,10 +13689,8 @@ define void @s_shuffle_v4p0_v4p0__7_2_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13755,10 +13744,8 @@ define void @s_shuffle_v4p0_v4p0__7_3_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13807,10 +13794,8 @@ define void @s_shuffle_v4p0_v4p0__7_4_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13823,18 +13808,43 @@ define void @s_shuffle_v4p0_v4p0__7_4_u_u() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_5_u_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_5_u_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_u_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_u_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_u_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 poison, i32 poison>
@@ -13879,10 +13889,8 @@ define void @s_shuffle_v4p0_v4p0__7_6_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13927,10 +13935,8 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -13989,13 +13995,11 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14054,13 +14058,11 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14114,10 +14116,8 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14175,12 +14175,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14229,12 +14226,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14283,12 +14277,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14301,20 +14292,48 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_u() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_7_6_u() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_6_u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_u:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_u:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_u:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 poison>
@@ -14363,12 +14382,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_u() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14434,14 +14450,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14507,14 +14519,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14580,14 +14588,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14649,12 +14653,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14711,14 +14712,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14771,14 +14768,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14835,14 +14828,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14855,22 +14844,53 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_6() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -14919,12 +14939,9 @@ define void @s_shuffle_v4p0_v4p0__u_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -14936,22 +14953,53 @@ define void @s_shuffle_v4p0_v4p0__u_0_0_0() {
 }
 
 define void @s_shuffle_v4p0_v4p0__0_0_0_0() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_0_0_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s9
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s9
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_0_0_0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s8
+; GFX900-NEXT:    s_mov_b32 s11, s9
+; GFX900-NEXT:    s_mov_b32 s12, s8
+; GFX900-NEXT:    s_mov_b32 s13, s9
+; GFX900-NEXT:    s_mov_b32 s14, s8
+; GFX900-NEXT:    s_mov_b32 s15, s9
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_0_0_0:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s8
+; GFX90A-NEXT:    s_mov_b32 s11, s9
+; GFX90A-NEXT:    s_mov_b32 s12, s8
+; GFX90A-NEXT:    s_mov_b32 s13, s9
+; GFX90A-NEXT:    s_mov_b32 s14, s8
+; GFX90A-NEXT:    s_mov_b32 s15, s9
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_0_0_0:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[8:9]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> zeroinitializer
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -15003,14 +15051,10 @@ define void @s_shuffle_v4p0_v4p0__1_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15062,14 +15106,10 @@ define void @s_shuffle_v4p0_v4p0__2_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15125,14 +15165,10 @@ define void @s_shuffle_v4p0_v4p0__3_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15184,12 +15220,9 @@ define void @s_shuffle_v4p0_v4p0__4_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15251,17 +15284,13 @@ define void @s_shuffle_v4p0_v4p0__5_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15324,15 +15353,12 @@ define void @s_shuffle_v4p0_v4p0__6_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15395,17 +15421,13 @@ define void @s_shuffle_v4p0_v4p0__7_0_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15464,15 +15486,12 @@ define void @s_shuffle_v4p0_v4p0__7_u_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15535,17 +15554,13 @@ define void @s_shuffle_v4p0_v4p0__7_1_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15611,14 +15626,10 @@ define void @s_shuffle_v4p0_v4p0__7_2_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15680,14 +15691,10 @@ define void @s_shuffle_v4p0_v4p0__7_3_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15750,17 +15757,13 @@ define void @s_shuffle_v4p0_v4p0__7_4_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15822,12 +15825,9 @@ define void @s_shuffle_v4p0_v4p0__7_5_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15893,14 +15893,10 @@ define void @s_shuffle_v4p0_v4p0__7_6_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -15963,15 +15959,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16030,13 +16023,11 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16099,15 +16090,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16169,14 +16157,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16238,14 +16222,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16304,15 +16284,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16375,15 +16352,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s0
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16445,12 +16419,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_0() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16463,20 +16434,48 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_0() {
 }
 
 define void @s_shuffle_v4p0_v4p0__u_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__u_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -16484,89 +16483,210 @@ define void @s_shuffle_v4p0_v4p0__u_1_1_1() {
 }
 
 define void @s_shuffle_v4p0_v4p0__0_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__1_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__1_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__2_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__2_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__1_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__2_1_1_1() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
 define void @s_shuffle_v4p0_v4p0__3_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__3_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -16574,20 +16694,48 @@ define void @s_shuffle_v4p0_v4p0__3_1_1_1() {
 }
 
 define void @s_shuffle_v4p0_v4p0__4_1_1_1() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_1_1_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_1_1_1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_1_1_1:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__4_1_1_1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 1, i32 1, i32 1>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
@@ -16644,12 +16792,9 @@ define void @s_shuffle_v4p0_v4p0__5_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16711,12 +16856,9 @@ define void @s_shuffle_v4p0_v4p0__6_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16778,12 +16920,9 @@ define void @s_shuffle_v4p0_v4p0__7_1_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16842,15 +16981,12 @@ define void @s_shuffle_v4p0_v4p0__7_u_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16913,17 +17049,13 @@ define void @s_shuffle_v4p0_v4p0__7_0_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -16985,14 +17117,10 @@ define void @s_shuffle_v4p0_v4p0__7_2_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17054,14 +17182,10 @@ define void @s_shuffle_v4p0_v4p0__7_3_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17124,17 +17248,13 @@ define void @s_shuffle_v4p0_v4p0__7_4_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17196,12 +17316,9 @@ define void @s_shuffle_v4p0_v4p0__7_5_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17267,14 +17384,10 @@ define void @s_shuffle_v4p0_v4p0__7_6_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17337,15 +17450,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17404,13 +17514,11 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17473,15 +17581,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17539,14 +17644,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17604,14 +17705,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17670,15 +17767,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17741,15 +17835,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s14, s2
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17811,12 +17902,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_1() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -17829,149 +17917,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_1() {
 }
 
 define void @s_shuffle_v4p0_v4p0__u_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__0_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__1_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__1_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__2_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__2_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__3_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__3_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__4_2_2_2() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_2_2_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 2, i32 2, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__5_2_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2:
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -17981,17 +17932,12 @@ define void @s_shuffle_v4p0_v4p0__5_2_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -18001,42 +17947,31 @@ define void @s_shuffle_v4p0_v4p0__5_2_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__u_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 2, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__6_2_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2:
+define void @s_shuffle_v4p0_v4p0__0_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -18046,15 +17981,12 @@ define void @s_shuffle_v4p0_v4p0__6_2_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -18064,42 +17996,31 @@ define void @s_shuffle_v4p0_v4p0__6_2_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 2, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_2_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2:
+define void @s_shuffle_v4p0_v4p0__1_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s10, s12
@@ -18111,15 +18032,12 @@ define void @s_shuffle_v4p0_v4p0__7_2_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
@@ -18131,44 +18049,36 @@ define void @s_shuffle_v4p0_v4p0__7_2_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_u_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2:
+define void @s_shuffle_v4p0_v4p0__2_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18176,17 +18086,16 @@ define void @s_shuffle_v4p0_v4p0__7_u_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18194,115 +18103,88 @@ define void @s_shuffle_v4p0_v4p0__7_u_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_0_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2:
+define void @s_shuffle_v4p0_v4p0__3_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_1_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2:
+define void @s_shuffle_v4p0_v4p0__4_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18310,17 +18192,14 @@ define void @s_shuffle_v4p0_v4p0__7_1_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18328,32 +18207,26 @@ define void @s_shuffle_v4p0_v4p0__7_1_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__4_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_3_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2:
+define void @s_shuffle_v4p0_v4p0__5_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18362,10 +18235,10 @@ define void @s_shuffle_v4p0_v4p0__7_3_2_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18373,7 +18246,7 @@ define void @s_shuffle_v4p0_v4p0__7_3_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18382,10 +18255,10 @@ define void @s_shuffle_v4p0_v4p0__7_3_2_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18393,7 +18266,7 @@ define void @s_shuffle_v4p0_v4p0__7_3_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18402,25 +18275,22 @@ define void @s_shuffle_v4p0_v4p0__7_3_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_4_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2:
+define void @s_shuffle_v4p0_v4p0__6_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18429,10 +18299,8 @@ define void @s_shuffle_v4p0_v4p0__7_4_2_2() {
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18440,7 +18308,7 @@ define void @s_shuffle_v4p0_v4p0__7_4_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18449,10 +18317,8 @@ define void @s_shuffle_v4p0_v4p0__7_4_2_2() {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18460,7 +18326,7 @@ define void @s_shuffle_v4p0_v4p0__7_4_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18469,65 +18335,62 @@ define void @s_shuffle_v4p0_v4p0__7_4_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_5_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2:
+define void @s_shuffle_v4p0_v4p0__7_2_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18536,37 +18399,32 @@ define void @s_shuffle_v4p0_v4p0__7_5_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_6_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2:
+define void @s_shuffle_v4p0_v4p0__7_u_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18574,19 +18432,17 @@ define void @s_shuffle_v4p0_v4p0__7_6_2_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18594,7 +18450,7 @@ define void @s_shuffle_v4p0_v4p0__7_6_2_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18603,61 +18459,65 @@ define void @s_shuffle_v4p0_v4p0__7_6_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_2_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2:
+define void @s_shuffle_v4p0_v4p0__7_0_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s12
-; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s12
-; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18666,35 +18526,33 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_u_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2:
+define void @s_shuffle_v4p0_v4p0__7_1_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[16:23]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
 ; GFX900-NEXT:    s_mov_b32 s14, s12
 ; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18702,17 +18560,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[16:23]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
 ; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18720,7 +18578,7 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18729,61 +18587,61 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_0_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2:
+define void @s_shuffle_v4p0_v4p0__7_3_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18792,67 +18650,62 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_1_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2:
+define void @s_shuffle_v4p0_v4p0__7_4_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18861,39 +18714,34 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_3_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2:
+define void @s_shuffle_v4p0_v4p0__7_5_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    s_mov_b32 s14, s16
 ; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -18901,19 +18749,19 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_2() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    s_mov_b32 s14, s16
 ; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -18921,7 +18769,7 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_2() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -18930,203 +18778,182 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_4_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2:
+define void @s_shuffle_v4p0_v4p0__7_6_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[16:23]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[16:23]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_5_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2:
+define void @s_shuffle_v4p0_v4p0__7_7_2_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 2>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_6_2() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2:
+define void @s_shuffle_v4p0_v4p0__7_7_u_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19135,197 +18962,58 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_2() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 2>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__u_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__0_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__0_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__1_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__1_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__2_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__2_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__3_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__3_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__4_3_3_3() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_3_3_3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__5_3_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3:
+define void @s_shuffle_v4p0_v4p0__7_7_0_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19334,61 +19022,63 @@ define void @s_shuffle_v4p0_v4p0__5_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__6_3_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3:
+define void @s_shuffle_v4p0_v4p0__7_7_1_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19397,65 +19087,63 @@ define void @s_shuffle_v4p0_v4p0__6_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_3_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3:
+define void @s_shuffle_v4p0_v4p0__7_7_3_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19464,86 +19152,87 @@ define void @s_shuffle_v4p0_v4p0__7_3_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_u_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3:
+define void @s_shuffle_v4p0_v4p0__7_7_4_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_0_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3:
+define void @s_shuffle_v4p0_v4p0__7_7_5_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19554,18 +19243,16 @@ define void @s_shuffle_v4p0_v4p0__7_0_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19576,83 +19263,81 @@ define void @s_shuffle_v4p0_v4p0__7_0_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_1_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3:
+define void @s_shuffle_v4p0_v4p0__7_7_6_2() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -19661,35 +19346,29 @@ define void @s_shuffle_v4p0_v4p0__7_1_3_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 2>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_2_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3:
+define void @s_shuffle_v4p0_v4p0__u_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19697,19 +19376,14 @@ define void @s_shuffle_v4p0_v4p0__7_2_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19717,46 +19391,33 @@ define void @s_shuffle_v4p0_v4p0__7_2_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__u_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_4_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3:
+define void @s_shuffle_v4p0_v4p0__0_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19764,19 +19425,14 @@ define void @s_shuffle_v4p0_v4p0__7_4_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19784,113 +19440,89 @@ define void @s_shuffle_v4p0_v4p0__7_4_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_5_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3:
+define void @s_shuffle_v4p0_v4p0__1_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s12, s18
-; GFX900-NEXT:    s_mov_b32 s13, s19
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s12, s18
-; GFX90A-NEXT:    s_mov_b32 s13, s19
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_6_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3:
+define void @s_shuffle_v4p0_v4p0__2_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19898,19 +19530,16 @@ define void @s_shuffle_v4p0_v4p0__7_6_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19918,44 +19547,36 @@ define void @s_shuffle_v4p0_v4p0__7_6_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_3_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3:
+define void @s_shuffle_v4p0_v4p0__3_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -19963,17 +19584,16 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19981,127 +19601,116 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_u_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3:
+define void @s_shuffle_v4p0_v4p0__4_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__4_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_0_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3:
+define void @s_shuffle_v4p0_v4p0__5_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20110,67 +19719,58 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_1_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3:
+define void @s_shuffle_v4p0_v4p0__6_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20179,27 +19779,22 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_2_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3:
+define void @s_shuffle_v4p0_v4p0__7_3_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -20210,12 +19805,16 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -20226,12 +19825,16 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20240,23 +19843,22 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_4_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3:
+define void @s_shuffle_v4p0_v4p0__7_u_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -20267,14 +19869,14 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_3() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -20285,14 +19887,14 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_3() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20301,61 +19903,65 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_5_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3:
+define void @s_shuffle_v4p0_v4p0__7_0_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20364,65 +19970,59 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 3>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_6_3() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3:
+define void @s_shuffle_v4p0_v4p0__7_1_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[16:23]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s14
-; GFX900-NEXT:    s_mov_b32 s9, s15
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
-; GFX900-NEXT:    s_mov_b32 s14, s18
-; GFX900-NEXT:    s_mov_b32 s15, s19
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[16:23]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s14
-; GFX90A-NEXT:    s_mov_b32 s9, s15
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s18
-; GFX90A-NEXT:    s_mov_b32 s15, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -20431,584 +20031,609 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_3() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 3>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__u_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__0_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4:
+define void @s_shuffle_v4p0_v4p0__7_2_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__1_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4:
+define void @s_shuffle_v4p0_v4p0__7_4_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4:
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__2_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4:
+define void @s_shuffle_v4p0_v4p0__7_5_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s18
+; GFX900-NEXT:    s_mov_b32 s13, s19
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s18
+; GFX90A-NEXT:    s_mov_b32 s13, s19
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__3_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4:
+define void @s_shuffle_v4p0_v4p0__7_6_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__4_4_4_4() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_4_4_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__5_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4:
+define void @s_shuffle_v4p0_v4p0__7_7_3_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__6_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4:
+define void @s_shuffle_v4p0_v4p0__7_7_u_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_4_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4:
+define void @s_shuffle_v4p0_v4p0__7_7_0_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_u_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4:
+define void @s_shuffle_v4p0_v4p0__7_7_1_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_0_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4:
+define void @s_shuffle_v4p0_v4p0__7_7_2_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_1_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4:
+define void @s_shuffle_v4p0_v4p0__7_7_4_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21017,25 +20642,22 @@ define void @s_shuffle_v4p0_v4p0__7_1_4_4() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_2_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4:
+define void @s_shuffle_v4p0_v4p0__7_7_5_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21046,18 +20668,14 @@ define void @s_shuffle_v4p0_v4p0__7_2_4_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21068,304 +20686,240 @@ define void @s_shuffle_v4p0_v4p0__7_2_4_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s16
-; GFX942-NEXT:    s_mov_b32 s13, s17
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_3_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4:
+define void @s_shuffle_v4p0_v4p0__7_7_6_3() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s18
+; GFX900-NEXT:    s_mov_b32 s15, s19
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s18
+; GFX90A-NEXT:    s_mov_b32 s15, s19
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s16
-; GFX942-NEXT:    s_mov_b32 s13, s17
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 3>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_5_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4:
+define void @s_shuffle_v4p0_v4p0__u_4_4_4() {
+; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_4_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:15]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__0_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s6
-; GFX900-NEXT:    s_mov_b32 s11, s7
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s6
-; GFX90A-NEXT:    s_mov_b32 s11, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_6_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4:
+define void @s_shuffle_v4p0_v4p0__1_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s20
-; GFX900-NEXT:    s_mov_b32 s11, s21
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s20
-; GFX90A-NEXT:    s_mov_b32 s11, s21
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_4_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4:
+define void @s_shuffle_v4p0_v4p0__2_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_u_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4:
+define void @s_shuffle_v4p0_v4p0__3_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21373,14 +20927,12 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21388,190 +20940,171 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_0_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4:
+define void @s_shuffle_v4p0_v4p0__4_4_4_4() {
+; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_4_4_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[8:15]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__5_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s22
-; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s4
 ; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s22
-; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s4
 ; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_1_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4:
+define void @s_shuffle_v4p0_v4p0__6_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[16:23]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s22
-; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s22
-; GFX900-NEXT:    s_mov_b32 s11, s23
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[16:23]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s22
-; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s22
-; GFX90A-NEXT:    s_mov_b32 s11, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_2_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4:
+define void @s_shuffle_v4p0_v4p0__7_4_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    s_mov_b32 s14, s4
 ; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21579,17 +21112,18 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    s_mov_b32 s14, s4
 ; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21597,46 +21131,38 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_3_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4:
+define void @s_shuffle_v4p0_v4p0__7_u_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    s_mov_b32 s14, s4
 ; GFX900-NEXT:    s_mov_b32 s15, s5
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21644,19 +21170,16 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    s_mov_b32 s14, s4
 ; GFX90A-NEXT:    s_mov_b32 s15, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21664,107 +21187,110 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[16:23]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s22
-; GFX942-NEXT:    s_mov_b32 s9, s23
-; GFX942-NEXT:    s_mov_b32 s10, s22
-; GFX942-NEXT:    s_mov_b32 s11, s23
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_5_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4:
+define void @s_shuffle_v4p0_v4p0__7_0_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s4
-; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s4
-; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 4, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_6_4() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4:
+define void @s_shuffle_v4p0_v4p0__7_1_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[16:23]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s22
 ; GFX900-NEXT:    s_mov_b32 s9, s23
-; GFX900-NEXT:    s_mov_b32 s10, s22
-; GFX900-NEXT:    s_mov_b32 s11, s23
-; GFX900-NEXT:    s_mov_b32 s12, s20
-; GFX900-NEXT:    s_mov_b32 s13, s21
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    s_mov_b32 s14, s16
 ; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21772,18 +21298,19 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_4() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[16:23]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s22
 ; GFX90A-NEXT:    s_mov_b32 s9, s23
-; GFX90A-NEXT:    s_mov_b32 s10, s22
-; GFX90A-NEXT:    s_mov_b32 s11, s23
-; GFX90A-NEXT:    s_mov_b32 s12, s20
-; GFX90A-NEXT:    s_mov_b32 s13, s21
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    s_mov_b32 s14, s16
 ; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21791,65 +21318,1353 @@ define void @s_shuffle_v4p0_v4p0__7_7_6_4() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s0
-; GFX942-NEXT:    s_mov_b32 s15, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 4>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_2_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_3_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_5_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s6
+; GFX900-NEXT:    s_mov_b32 s11, s7
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s6
+; GFX90A-NEXT:    s_mov_b32 s11, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_6_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s20
+; GFX900-NEXT:    s_mov_b32 s11, s21
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s20
+; GFX90A-NEXT:    s_mov_b32 s11, s21
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_4_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_u_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_0_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_1_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_2_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_3_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[16:23]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[22:23]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_5_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s4
+; GFX900-NEXT:    s_mov_b32 s15, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s4
+; GFX90A-NEXT:    s_mov_b32 s15, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 4>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_7_6_4() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[16:23]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s22
+; GFX900-NEXT:    s_mov_b32 s9, s23
+; GFX900-NEXT:    s_mov_b32 s10, s22
+; GFX900-NEXT:    s_mov_b32 s11, s23
+; GFX900-NEXT:    s_mov_b32 s12, s20
+; GFX900-NEXT:    s_mov_b32 s13, s21
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[16:23]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s22
+; GFX90A-NEXT:    s_mov_b32 s9, s23
+; GFX90A-NEXT:    s_mov_b32 s10, s22
+; GFX90A-NEXT:    s_mov_b32 s11, s23
+; GFX90A-NEXT:    s_mov_b32 s12, s20
+; GFX90A-NEXT:    s_mov_b32 s13, s21
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[0:1]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 4>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
 define void @s_shuffle_v4p0_v4p0__u_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__u_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__0_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__1_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 1, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__2_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__3_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__4_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__4_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__5_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__6_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s10
+; GFX900-NEXT:    s_mov_b32 s13, s11
+; GFX900-NEXT:    s_mov_b32 s14, s10
+; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s10
+; GFX90A-NEXT:    s_mov_b32 s13, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s10
+; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_u_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__0_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5:
+define void @s_shuffle_v4p0_v4p0__7_0_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s14
-; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -21857,17 +22672,19 @@ define void @s_shuffle_v4p0_v4p0__0_5_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s14
-; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -21875,74 +22692,69 @@ define void @s_shuffle_v4p0_v4p0__0_5_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s2
-; GFX942-NEXT:    s_mov_b32 s11, s3
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__1_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5:
+define void @s_shuffle_v4p0_v4p0__7_1_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
-; GFX900-NEXT:    s_mov_b32 s14, s10
-; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s10
-; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -21951,253 +22763,150 @@ define void @s_shuffle_v4p0_v4p0__1_5_5_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 1, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__2_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5:
+define void @s_shuffle_v4p0_v4p0__7_2_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
-; GFX900-NEXT:    s_mov_b32 s14, s10
-; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s10
-; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__3_5_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5:
+define void @s_shuffle_v4p0_v4p0__7_3_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s10
-; GFX900-NEXT:    s_mov_b32 s13, s11
-; GFX900-NEXT:    s_mov_b32 s14, s10
-; GFX900-NEXT:    s_mov_b32 s15, s11
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s10
-; GFX90A-NEXT:    s_mov_b32 s13, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s10
-; GFX90A-NEXT:    s_mov_b32 s15, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ; def s[12:19]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s10
-; GFX942-NEXT:    s_mov_b32 s13, s11
-; GFX942-NEXT:    s_mov_b32 s14, s10
-; GFX942-NEXT:    s_mov_b32 s15, s11
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__4_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__5_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__5_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__6_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__6_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 5, i32 5, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__7_5_5_5() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_5_5_5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s10
-; GFX9-NEXT:    s_mov_b32 s13, s11
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s15, s11
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_u_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5:
+define void @s_shuffle_v4p0_v4p0__7_4_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22205,6 +22914,8 @@ define void @s_shuffle_v4p0_v4p0__7_u_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s4
+; GFX900-NEXT:    s_mov_b32 s11, s5
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
@@ -22214,7 +22925,7 @@ define void @s_shuffle_v4p0_v4p0__7_u_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22222,6 +22933,8 @@ define void @s_shuffle_v4p0_v4p0__7_u_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s4
+; GFX90A-NEXT:    s_mov_b32 s11, s5
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
@@ -22231,43 +22944,38 @@ define void @s_shuffle_v4p0_v4p0__7_u_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_0_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5:
+define void @s_shuffle_v4p0_v4p0__7_6_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
 ; GFX900-NEXT:    s_mov_b32 s12, s14
 ; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22275,19 +22983,16 @@ define void @s_shuffle_v4p0_v4p0__7_0_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s12, s14
 ; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22295,114 +23000,92 @@ define void @s_shuffle_v4p0_v4p0__7_0_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_1_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5:
+define void @s_shuffle_v4p0_v4p0__7_7_5_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s6
+; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s6
+; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[8:15]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_2_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5:
+define void @s_shuffle_v4p0_v4p0__7_7_u_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s12
-; GFX900-NEXT:    s_mov_b32 s11, s13
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22410,21 +23093,14 @@ define void @s_shuffle_v4p0_v4p0__7_2_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s12
-; GFX90A-NEXT:    s_mov_b32 s11, s13
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22432,34 +23108,28 @@ define void @s_shuffle_v4p0_v4p0__7_2_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_3_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5:
+define void @s_shuffle_v4p0_v4p0__7_7_0_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22470,14 +23140,16 @@ define void @s_shuffle_v4p0_v4p0__7_3_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22488,174 +23160,176 @@ define void @s_shuffle_v4p0_v4p0__7_3_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ; def s[4:11]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_4_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5:
+define void @s_shuffle_v4p0_v4p0__7_7_1_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s10, s4
-; GFX900-NEXT:    s_mov_b32 s11, s5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s18
+; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s10, s18
+; GFX900-NEXT:    s_mov_b32 s11, s19
 ; GFX900-NEXT:    s_mov_b32 s12, s6
 ; GFX900-NEXT:    s_mov_b32 s13, s7
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s10, s4
-; GFX90A-NEXT:    s_mov_b32 s11, s5
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s18
+; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s10, s18
+; GFX90A-NEXT:    s_mov_b32 s11, s19
 ; GFX90A-NEXT:    s_mov_b32 s12, s6
 ; GFX90A-NEXT:    s_mov_b32 s13, s7
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_6_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5:
+define void @s_shuffle_v4p0_v4p0__7_7_2_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s14, s6
+; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s14, s6
+; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_5_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5:
+define void @s_shuffle_v4p0_v4p0__7_7_3_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22663,16 +23337,19 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22680,33 +23357,31 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_u_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5:
+define void @s_shuffle_v4p0_v4p0__7_7_4_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22714,6 +23389,8 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s10
 ; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s12, s4
+; GFX900-NEXT:    s_mov_b32 s13, s5
 ; GFX900-NEXT:    s_mov_b32 s14, s6
 ; GFX900-NEXT:    s_mov_b32 s15, s7
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -22721,7 +23398,7 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_5() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22729,6 +23406,8 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s10
 ; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s12, s4
+; GFX90A-NEXT:    s_mov_b32 s13, s5
 ; GFX90A-NEXT:    s_mov_b32 s14, s6
 ; GFX90A-NEXT:    s_mov_b32 s15, s7
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -22736,201 +23415,175 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_5() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_0_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5:
+define void @s_shuffle_v4p0_v4p0__7_7_6_5() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
 ; GFX900-NEXT:    s_mov_b32 s10, s18
 ; GFX900-NEXT:    s_mov_b32 s11, s19
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
 ; GFX90A-NEXT:    s_mov_b32 s10, s18
 ; GFX90A-NEXT:    s_mov_b32 s11, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 5>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_1_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_5:
+define void @s_shuffle_v4p0_v4p0__u_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s18
-; GFX900-NEXT:    s_mov_b32 s11, s19
-; GFX900-NEXT:    s_mov_b32 s12, s6
-; GFX900-NEXT:    s_mov_b32 s13, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s18
-; GFX90A-NEXT:    s_mov_b32 s11, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s6
-; GFX90A-NEXT:    s_mov_b32 s13, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_1_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__u_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[4:11]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s10
-; GFX942-NEXT:    s_mov_b32 s9, s11
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 poison, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_2_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_5:
+define void @s_shuffle_v4p0_v4p0__0_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s10, s16
+; GFX900-NEXT:    s_mov_b32 s11, s17
+; GFX900-NEXT:    s_mov_b32 s12, s16
+; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b32 s15, s17
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s12, s16
+; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s14, s16
+; GFX90A-NEXT:    s_mov_b32 s15, s17
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_2_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
@@ -22939,311 +23592,271 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_5() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 0, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_3_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_5:
+define void @s_shuffle_v4p0_v4p0__1_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ; def s[4:11]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s14
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    s_mov_b32 s8, s6
+; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ; def s[4:11]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s14
-; GFX90A-NEXT:    s_mov_b32 s13, s15
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_3_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[12:19]
+; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 1, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_4_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5:
+define void @s_shuffle_v4p0_v4p0__2_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
+; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s10
-; GFX900-NEXT:    s_mov_b32 s9, s11
-; GFX900-NEXT:    s_mov_b32 s12, s4
-; GFX900-NEXT:    s_mov_b32 s13, s5
-; GFX900-NEXT:    s_mov_b32 s14, s6
-; GFX900-NEXT:    s_mov_b32 s15, s7
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s16
+; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
+; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s10
-; GFX90A-NEXT:    s_mov_b32 s9, s11
-; GFX90A-NEXT:    s_mov_b32 s12, s4
-; GFX90A-NEXT:    s_mov_b32 s13, s5
-; GFX90A-NEXT:    s_mov_b32 s14, s6
-; GFX90A-NEXT:    s_mov_b32 s15, s7
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 5>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 2, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__7_7_6_5() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5:
+define void @s_shuffle_v4p0_v4p0__3_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[12:19]
 ; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_mov_b32 s8, s18
 ; GFX900-NEXT:    s_mov_b32 s9, s19
-; GFX900-NEXT:    s_mov_b32 s10, s18
-; GFX900-NEXT:    s_mov_b32 s11, s19
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[12:19]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_mov_b32 s8, s18
 ; GFX90A-NEXT:    s_mov_b32 s9, s19
-; GFX90A-NEXT:    s_mov_b32 s10, s18
-; GFX90A-NEXT:    s_mov_b32 s11, s19
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s2
-; GFX942-NEXT:    s_mov_b32 s15, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 5>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__u_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 poison, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 3, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__0_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_6_6_6:
+define void @s_shuffle_v4p0_v4p0__4_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s10, s16
-; GFX900-NEXT:    s_mov_b32 s11, s17
-; GFX900-NEXT:    s_mov_b32 s12, s16
-; GFX900-NEXT:    s_mov_b32 s13, s17
-; GFX900-NEXT:    s_mov_b32 s14, s16
-; GFX900-NEXT:    s_mov_b32 s15, s17
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
 ; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; use s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s10, s16
-; GFX90A-NEXT:    s_mov_b32 s11, s17
-; GFX90A-NEXT:    s_mov_b32 s12, s16
-; GFX90A-NEXT:    s_mov_b32 s13, s17
-; GFX90A-NEXT:    s_mov_b32 s14, s16
-; GFX90A-NEXT:    s_mov_b32 s15, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__0_6_6_6:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__4_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 0, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 4, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__1_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_6_6_6:
+define void @s_shuffle_v4p0_v4p0__5_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[4:11]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s6
-; GFX900-NEXT:    s_mov_b32 s9, s7
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -23253,17 +23866,14 @@ define void @s_shuffle_v4p0_v4p0__1_6_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[4:11]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -23273,44 +23883,35 @@ define void @s_shuffle_v4p0_v4p0__1_6_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_6_6_6:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 1, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__2_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_6_6_6:
+define void @s_shuffle_v4p0_v4p0__6_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s16
-; GFX900-NEXT:    s_mov_b32 s9, s17
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -23320,17 +23921,14 @@ define void @s_shuffle_v4p0_v4p0__2_6_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s16
-; GFX90A-NEXT:    s_mov_b32 s9, s17
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -23340,44 +23938,35 @@ define void @s_shuffle_v4p0_v4p0__2_6_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_6_6_6:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 2, i32 6, i32 6, i32 6>
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
   call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
   ret void
 }
 
-define void @s_shuffle_v4p0_v4p0__3_6_6_6() {
-; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_6_6_6:
+define void @s_shuffle_v4p0_v4p0__7_6_6_6() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_6_6:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ;;#ASMSTART
-; GFX900-NEXT:    ; def s[12:19]
-; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    ;;#ASMSTART
 ; GFX900-NEXT:    ; def s[8:15]
 ; GFX900-NEXT:    ;;#ASMEND
-; GFX900-NEXT:    s_mov_b32 s8, s18
-; GFX900-NEXT:    s_mov_b32 s9, s19
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
 ; GFX900-NEXT:    s_mov_b32 s10, s12
 ; GFX900-NEXT:    s_mov_b32 s11, s13
 ; GFX900-NEXT:    s_mov_b32 s14, s12
@@ -23387,17 +23976,14 @@ define void @s_shuffle_v4p0_v4p0__3_6_6_6() {
 ; GFX900-NEXT:    ;;#ASMEND
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_6_6_6:
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_6_6:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def s[12:19]
-; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def s[8:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    s_mov_b32 s8, s18
-; GFX90A-NEXT:    s_mov_b32 s9, s19
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
 ; GFX90A-NEXT:    s_mov_b32 s10, s12
 ; GFX90A-NEXT:    s_mov_b32 s11, s13
 ; GFX90A-NEXT:    s_mov_b32 s14, s12
@@ -23407,119 +23993,19 @@ define void @s_shuffle_v4p0_v4p0__3_6_6_6() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_6_6_6:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_6_6:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s12
-; GFX942-NEXT:    s_mov_b32 s11, s13
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 3, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__4_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 4, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__5_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__5_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__6_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__6_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__7_6_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_6_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 6, i32 6>
@@ -23528,20 +24014,48 @@ define void @s_shuffle_v4p0_v4p0__7_6_6_6() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_u_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_u_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 6, i32 6>
@@ -23599,12 +24113,9 @@ define void @s_shuffle_v4p0_v4p0__7_0_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23666,12 +24177,9 @@ define void @s_shuffle_v4p0_v4p0__7_1_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23733,12 +24241,9 @@ define void @s_shuffle_v4p0_v4p0__7_2_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23800,12 +24305,9 @@ define void @s_shuffle_v4p0_v4p0__7_3_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s12
-; GFX942-NEXT:    s_mov_b32 s15, s13
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23862,14 +24364,10 @@ define void @s_shuffle_v4p0_v4p0__7_4_6_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s4
-; GFX942-NEXT:    s_mov_b32 s13, s5
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -23882,20 +24380,48 @@ define void @s_shuffle_v4p0_v4p0__7_4_6_6() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_5_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_5_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 6, i32 6>
@@ -23904,22 +24430,53 @@ define void @s_shuffle_v4p0_v4p0__7_5_6_6() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_7_6_6() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_6_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s14, s12
-; GFX9-NEXT:    s_mov_b32 s15, s13
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_6:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s14, s12
+; GFX900-NEXT:    s_mov_b32 s15, s13
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_6:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s14, s12
+; GFX90A-NEXT:    s_mov_b32 s15, s13
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 6>
@@ -23968,12 +24525,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24039,14 +24593,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24112,14 +24662,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24181,12 +24727,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24252,14 +24795,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s18
-; GFX942-NEXT:    s_mov_b32 s9, s19
-; GFX942-NEXT:    s_mov_b32 s10, s18
-; GFX942-NEXT:    s_mov_b32 s11, s19
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s16
-; GFX942-NEXT:    s_mov_b32 s15, s17
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[18:19]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[16:17]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24312,14 +24851,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24376,14 +24911,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_6() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s4
-; GFX942-NEXT:    s_mov_b32 s15, s5
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[4:5]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24396,20 +24927,48 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_6() {
 }
 
 define void @s_shuffle_v4p0_v4p0__u_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__u_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__u_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 poison, i32 7, i32 7, i32 7>
@@ -24467,12 +25026,9 @@ define void @s_shuffle_v4p0_v4p0__0_7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24534,12 +25090,9 @@ define void @s_shuffle_v4p0_v4p0__1_7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s2
-; GFX942-NEXT:    s_mov_b32 s9, s3
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24601,12 +25154,9 @@ define void @s_shuffle_v4p0_v4p0__2_7_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s4
-; GFX942-NEXT:    s_mov_b32 s9, s5
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24659,117 +25209,232 @@ define void @s_shuffle_v4p0_v4p0__3_7_7_7() {
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_7_7_7:
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[0:7]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 3, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__4_7_7_7() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__4_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 4, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__5_7_7_7() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s10
+; GFX900-NEXT:    s_mov_b32 s9, s11
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s10
+; GFX90A-NEXT:    s_mov_b32 s9, s11
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__6_7_7_7() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_7_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s12
+; GFX900-NEXT:    s_mov_b32 s9, s13
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_7_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s12
+; GFX90A-NEXT:    s_mov_b32 s9, s13
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_7_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 7, i32 7, i32 7>
+  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p0_v4p0__7_u_7_7() {
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_7_7:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    ;;#ASMSTART
-; GFX942-NEXT:    ; def s[0:7]
-; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 3, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__4_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__4_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 4, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__5_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__5_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__6_7_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__6_7_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s12
-; GFX9-NEXT:    s_mov_b32 s9, s13
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %vec0 = call <4 x ptr> asm "; def $0", "=s"()
-  %vec1 = call <4 x ptr> asm "; def $0", "=s"()
-  %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 7, i32 7, i32 7>
-  call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v4p0_v4p0__7_u_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_u_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 7, i32 7>
@@ -24827,12 +25492,9 @@ define void @s_shuffle_v4p0_v4p0__7_0_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24894,12 +25556,9 @@ define void @s_shuffle_v4p0_v4p0__7_1_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -24961,12 +25620,9 @@ define void @s_shuffle_v4p0_v4p0__7_2_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s4
-; GFX942-NEXT:    s_mov_b32 s11, s5
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25028,12 +25684,9 @@ define void @s_shuffle_v4p0_v4p0__7_3_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s14
-; GFX942-NEXT:    s_mov_b32 s13, s15
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25090,14 +25743,10 @@ define void @s_shuffle_v4p0_v4p0__7_4_7_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s0
-; GFX942-NEXT:    s_mov_b32 s11, s1
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25110,20 +25759,48 @@ define void @s_shuffle_v4p0_v4p0__7_4_7_7() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_5_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_5_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_5_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 7, i32 7>
@@ -25132,22 +25809,53 @@ define void @s_shuffle_v4p0_v4p0__7_5_7_7() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_6_7_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_6_7_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s12
-; GFX9-NEXT:    s_mov_b32 s11, s13
-; GFX9-NEXT:    s_mov_b32 s12, s14
-; GFX9-NEXT:    s_mov_b32 s13, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_7_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s12
+; GFX900-NEXT:    s_mov_b32 s11, s13
+; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b32 s13, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_7_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s12
+; GFX90A-NEXT:    s_mov_b32 s11, s13
+; GFX90A-NEXT:    s_mov_b32 s12, s14
+; GFX90A-NEXT:    s_mov_b32 s13, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_7_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 7, i32 7>
@@ -25156,20 +25864,48 @@ define void @s_shuffle_v4p0_v4p0__7_6_7_7() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_7_u_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_u_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 7>
@@ -25227,12 +25963,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25294,12 +26027,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25361,12 +26091,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_2_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25428,12 +26155,9 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s14
-; GFX942-NEXT:    s_mov_b32 s9, s15
-; GFX942-NEXT:    s_mov_b32 s10, s14
-; GFX942-NEXT:    s_mov_b32 s11, s15
-; GFX942-NEXT:    s_mov_b32 s12, s6
-; GFX942-NEXT:    s_mov_b32 s13, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25486,14 +26210,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s0
-; GFX942-NEXT:    s_mov_b32 s13, s1
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25550,14 +26270,10 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_7() {
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; def s[0:7]
 ; GFX942-NEXT:    ;;#ASMEND
-; GFX942-NEXT:    s_mov_b32 s8, s6
-; GFX942-NEXT:    s_mov_b32 s9, s7
-; GFX942-NEXT:    s_mov_b32 s10, s6
-; GFX942-NEXT:    s_mov_b32 s11, s7
-; GFX942-NEXT:    s_mov_b32 s12, s2
-; GFX942-NEXT:    s_mov_b32 s13, s3
-; GFX942-NEXT:    s_mov_b32 s14, s6
-; GFX942-NEXT:    s_mov_b32 s15, s7
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GFX942-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GFX942-NEXT:    ;;#ASMSTART
 ; GFX942-NEXT:    ; use s[8:15]
 ; GFX942-NEXT:    ;;#ASMEND
@@ -25570,20 +26286,48 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_7() {
 }
 
 define void @s_shuffle_v4p0_v4p0__7_7_6_7() {
-; GFX9-LABEL: s_shuffle_v4p0_v4p0__7_7_6_7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; def s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_mov_b32 s8, s14
-; GFX9-NEXT:    s_mov_b32 s9, s15
-; GFX9-NEXT:    s_mov_b32 s10, s14
-; GFX9-NEXT:    s_mov_b32 s11, s15
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s[8:15]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_7:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; def s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_mov_b32 s8, s14
+; GFX900-NEXT:    s_mov_b32 s9, s15
+; GFX900-NEXT:    s_mov_b32 s10, s14
+; GFX900-NEXT:    s_mov_b32 s11, s15
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use s[8:15]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_7:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; def s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_mov_b32 s8, s14
+; GFX90A-NEXT:    s_mov_b32 s9, s15
+; GFX90A-NEXT:    s_mov_b32 s10, s14
+; GFX90A-NEXT:    s_mov_b32 s11, s15
+; GFX90A-NEXT:    ;;#ASMSTART
+; GFX90A-NEXT:    ; use s[8:15]
+; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_6_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; def s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[14:15]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[14:15]
+; GFX942-NEXT:    ;;#ASMSTART
+; GFX942-NEXT:    ; use s[8:15]
+; GFX942-NEXT:    ;;#ASMEND
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = call <4 x ptr> asm "; def $0", "=s"()
   %vec1 = call <4 x ptr> asm "; def $0", "=s"()
   %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
index 1e042d3b4a31f..69773bf265e8c 100644
--- a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
@@ -6,25 +6,22 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX942-LABEL: test:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], 0
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX942-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
-; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v12, s4
-; GFX942-NEXT:    v_mov_b32_e32 v13, s5
-; GFX942-NEXT:    v_mov_b32_e32 v4, s6
-; GFX942-NEXT:    v_mov_b32_e32 v5, s7
-; GFX942-NEXT:    v_mov_b32_e32 v6, s7
-; GFX942-NEXT:    v_mov_b32_e32 v7, s7
+; GFX942-NEXT:    v_mov_b32_e32 v8, s4
+; GFX942-NEXT:    v_mov_b32_e32 v9, s5
+; GFX942-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-NEXT:    v_mov_b32_e32 v1, s7
+; GFX942-NEXT:    v_mov_b32_e32 v2, s7
+; GFX942-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_smfmac_i32_16x16x64_i8 v[8:11], v[12:13], v[4:7], v13
+; GFX942-NEXT:    v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v9
 ; GFX942-NEXT:    s_nop 6
-; GFX942-NEXT:    global_store_dword v0, v11, s[2:3] offset:12
+; GFX942-NEXT:    global_store_dword v10, v7, s[2:3] offset:12
 ; GFX942-NEXT:    s_endpgm
 entry:
   %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 0
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
index d500a3e50f9f7..995ebc822f769 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
@@ -229,9 +229,10 @@ define half @test_vector_reduce_fmax_v3half(<3 x half> %v) {
 ; GFX9-SDAG-LABEL: test_vector_reduce_fmax_v3half:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    s_movk_i32 s0, 0x7e00
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, s0
+; GFX9-SDAG-NEXT:    s_movk_i32 s0, 0xfe00
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x5040100
+; GFX9-SDAG-NEXT:    v_perm_b32 v1, s0, v1, v2
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX9-SDAG-NEXT:    s_nop 0
 ; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
index 56e7e045e40eb..fe06485749651 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
@@ -229,9 +229,10 @@ define half @test_vector_reduce_fmin_v3half(<3 x half> %v) {
 ; GFX9-SDAG-LABEL: test_vector_reduce_fmin_v3half:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX9-SDAG-NEXT:    s_movk_i32 s0, 0x7e00
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, s0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x5040100
+; GFX9-SDAG-NEXT:    v_perm_b32 v1, s0, v1, v2
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX9-SDAG-NEXT:    s_nop 0
 ; GFX9-SDAG-NEXT:    v_pk_min_f16 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index d8264b5a091e1..776db1815a872 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -447,37 +447,36 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa
 ; GFX942-LABEL: v8i8_phi_zeroinit:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
-; GFX942-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
-; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX942-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v3, 3, v2
+; GFX942-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v2
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v2
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    global_load_dwordx2 v[2:3], v1, s[8:9]
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v3, s[8:9]
 ; GFX942-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX942-NEXT:  ; %bb.1: ; %bb.1
-; GFX942-NEXT:    global_load_dwordx2 v[4:5], v1, s[10:11]
-; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX942-NEXT:    s_waitcnt vmcnt(1)
-; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    global_load_dwordx2 v[4:5], v3, s[10:11]
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v2
 ; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX942-NEXT:    s_and_b64 s[4:5], vcc, exec
-; GFX942-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
 ; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX942-NEXT:  .LBB9_2: ; %Flow
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX942-NEXT:  ; %bb.3: ; %bb.2
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[12:13]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[12:13]
 ; GFX942-NEXT:  .LBB9_4: ; %bb.3
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    global_store_dwordx2 v0, v[4:5], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 entry:

>From 7955d7c4bf3106a60cc1d20e186c172bff451ee5 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <Janek.vanOirschot at amd.com>
Date: Thu, 31 Jul 2025 09:04:58 -0700
Subject: [PATCH 2/2] add todo and pre-size smallvector instead of push_back

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9dfc6b154d6f1..c27cef1134f6c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16060,6 +16060,7 @@ SDValue SITargetLowering::performSelectCombine(SDNode *N,
 SDValue
 SITargetLowering::performBuildVectorCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
+  // TODO: legalize for all targets instead of just v_mov_b64 enabled ones.
   const GCNSubtarget *ST = getSubtarget();
   if (DCI.Level < AfterLegalizeDAG || !ST->hasMovB64())
     return SDValue();
@@ -16131,9 +16132,9 @@ SITargetLowering::performBuildVectorCombine(SDNode *N,
 
   // Construct and return build_vector with 64b elements.
   if (!ImmVals.empty()) {
-    SmallVector<SDValue, 8> VectorConsts;
-    for (uint64_t I : ImmVals)
-      VectorConsts.push_back(DAG.getConstant(I, SL, MVT::i64));
+    SmallVector<SDValue, 8> VectorConsts(ImmVals.size());
+    for (unsigned i = 0; i < ImmVals.size(); ++i)
+      VectorConsts[i] = DAG.getConstant(ImmVals[i], SL, MVT::i64);
     unsigned NewNumElts = SizeBits / 64;
     LLVMContext &Ctx = *DAG.getContext();
     EVT NewVT = EVT::getVectorVT(Ctx, MVT::i64, NewNumElts);



More information about the llvm-commits mailing list